From c4deaa7426ec213e1057faecf9bea8aad90eb4b1 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sat, 27 Jan 2024 05:20:08 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 70843 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 71238 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..bce99211 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-01-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.10882v1","updated":"2024-01-19T18:49:36Z","published":"2024-01-19T18:49:36Z","title":"Reinforcement learning for question answering in programming domain\n using public community scoring as a human feedback","summary":" In this study, we investigate the enhancement of the GPT Neo 125M performance\nin Community Question Answering (CQA) with a focus on programming, through the\nintegration of Reinforcement Learning from Human Feedback (RLHF) and the\nutilization of scores from Stack Overflow. Two distinct reward model training\nstrategies are employed for fine-tuning with Proximal Policy Optimization\n(PPO). Notably, the improvements in performance achieved through this method\nare comparable to those of GPT Neo 2.7B parameter variant. Additionally, an\nauxiliary scoring mechanism is introduced, which demonstrates the limitations\nof conventional linguistic metrics in evaluating responses in the programming\ndomain. Through accurate analysis, this paper looks at the divergence between\ntraditional linguistic metrics and our human-preferences-based reward model,\nunderscoring the imperative for domain-specific evaluation methods. By\nelucidating the complexities involved in applying RLHF to programming CQA and\naccentuating the significance of context-aware evaluation, this study\ncontributes to the ongoing efforts in refining Large Language Models through\nfocused human feedback.\n","authors":["Alexey Gorbatovski","Sergey Kovalchuk"],"pdf_url":"https://arxiv.org/pdf/2401.10882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10862v1","updated":"2024-01-19T18:05:34Z","published":"2024-01-19T18:05:34Z","title":"Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs\n Without Fine-Tuning","summary":" Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type\nof attack that can coax these models into generating harmful and illegal\ncontent. In this paper, we show that pruning up to 20% of LLM parameters\nmarkedly increases their resistance to such attacks without additional training\nand without sacrificing their performance in standard benchmarks. Intriguingly,\nwe discovered that the enhanced safety observed post-pruning correlates to the\ninitial safety training level of the model, hinting that the effect of pruning\ncould be more general and may hold for other LLM behaviors beyond safety.\nAdditionally, we introduce a curated dataset of 225 harmful tasks across five\ncategories, inserted into ten different Jailbreaking prompts, showing that\npruning aids LLMs in concentrating attention on task-relevant tokens in\njailbreaking prompts. 
Lastly, our experiments reveal that the prominent chat\nmodels, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high\nsusceptibility to jailbreaking attacks, with some categories achieving nearly\n70-100% success rate. These insights underline the potential of pruning as a\ngeneralizable approach for improving LLM safety, reliability, and potentially\nother desired behaviors.\n","authors":["Adib Hasan","Ileana Rugina","Alex Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10850v1","updated":"2024-01-19T17:51:11Z","published":"2024-01-19T17:51:11Z","title":"Advancements in eHealth Data Analytics through Natural Language\n Processing and Deep Learning","summary":" The healthcare environment is commonly referred to as \"information-rich\" but\nalso \"knowledge poor\". Healthcare systems collect huge amounts of data from\nvarious sources: lab reports, medical letters, logs of medical tools or\nprograms, medical prescriptions, etc. These massive sets of data can provide\ngreat knowledge and information that can improve the medical services, and\noverall the healthcare domain, such as disease prediction by analyzing the\npatient's symptoms or disease prevention, by facilitating the discovery of\nbehavioral factors for diseases. Unfortunately, only a relatively small volume\nof the textual eHealth data is processed and interpreted, an important factor\nbeing the difficulty in efficiently performing Big Data operations. In the\nmedical field, detecting domain-specific multi-word terms is a crucial task as\nthey can define an entire concept with a few words. A term can be defined as a\nlinguistic structure or a concept, and it is composed of one or more words with\na specific meaning to a domain. All the terms of a domain create its\nterminology. This chapter offers a critical study of the current, most\nperformant solutions for analyzing unstructured (image and textual) eHealth\ndata. This study also provides a comparison of the current Natural Language\nProcessing and Deep Learning techniques in the eHealth context. Finally, we\nexamine and discuss some of the current issues, and we define a set of research\ndirections in this area.\n","authors":["Elena-Simona Apostol","Ciprian-Octavian Truică"],"pdf_url":"https://arxiv.org/pdf/2401.10850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. 
It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2309.14393v2","updated":"2024-01-19T17:33:44Z","published":"2023-09-25T14:50:04Z","title":"LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language\n Models","summary":" The carbon footprint associated with large language models (LLMs) is a\nsignificant concern, encompassing emissions from their training, inference,\nexperimentation, and storage processes, including operational and embodied\ncarbon emissions. An essential aspect is accurately estimating the carbon\nimpact of emerging LLMs even before their training, which heavily relies on GPU\nusage. Existing studies have reported the carbon footprint of LLM training, but\nonly one tool, mlco2, can predict the carbon footprint of new neural networks\nprior to physical training. However, mlco2 has several serious limitations. It\ncannot extend its estimation to dense or mixture-of-experts (MoE) LLMs,\ndisregards critical architectural parameters, focuses solely on GPUs, and\ncannot model embodied carbon footprints. Addressing these gaps, we introduce\n\\textit{\\carb}, an end-to-end carbon footprint projection model designed for\nboth dense and MoE LLMs. Compared to mlco2, \\carb~significantly enhances the\naccuracy of carbon footprint estimations for various LLMs. The source code is\nreleased at \\url{https://github.com/SotaroKaneda/MLCarbon}.\n","authors":["Ahmad Faiz","Sotaro Kaneda","Ruhan Wang","Rita Osi","Prateek Sharma","Fan Chen","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.14393v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.10825v1","updated":"2024-01-19T17:21:05Z","published":"2024-01-19T17:21:05Z","title":"A survey on recent advances in named entity recognition","summary":" Named Entity Recognition seeks to extract substrings within a text that name\nreal-world objects and to determine their type (for example, whether they refer\nto persons or organizations). In this survey, we first present an overview of\nrecent popular approaches, but we also look at graph- and transformer- based\nmethods including Large Language Models (LLMs) that have not had much coverage\nin other surveys. Second, we focus on methods designed for datasets with scarce\nannotations. Third, we evaluate the performance of the main NER implementations\non a variety of datasets with differing characteristics (as regards their\ndomain, their size, and their number of classes). We thus provide a deep\ncomparison of algorithms that are never considered together. 
Our experiments\nshed some light on how the characteristics of datasets affect the behavior of\nthe methods that we compare.\n","authors":["Imed Keraghel","Stanislas Morbieu","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2401.10825v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2401.05273v2","updated":"2024-01-19T16:57:30Z","published":"2024-01-10T17:13:28Z","title":"INACIA: Integrating Large Language Models in Brazilian Audit Courts:\n Opportunities and Challenges","summary":" This paper introduces INACIA (Instru\\c{c}\\~ao Assistida com Intelig\\^encia\nArtificial), a groundbreaking system designed to integrate Large Language\nModels (LLMs) into the operational framework of Brazilian Federal Court of\nAccounts (TCU). The system automates various stages of case analysis, including\nbasic information extraction, admissibility examination, Periculum in mora and\nFumus boni iuris analyses, and recommendations generation. Through a series of\nexperiments, we demonstrate INACIA's potential in extracting relevant\ninformation from case documents, evaluating its legal plausibility, and\nformulating propositions for judicial decision-making. Utilizing a validation\ndataset alongside LLMs, our evaluation methodology presents an innovative\napproach to assessing system performance, correlating highly with human\njudgment. The results highlight INACIA's proficiency in handling complex legal\ntasks, indicating its suitability for augmenting efficiency and judicial\nfairness within legal systems. The paper also discusses potential enhancements\nand future applications, positioning INACIA as a model for worldwide AI\nintegration in legal domains.\n","authors":["Jayr Pereira","Andre Assumpcao","Julio Trecenti","Luiz Airosa","Caio Lente","Jhonatan Cléto","Guilherme Dobins","Rodrigo Nogueira","Luis Mitchell","Roberto Lotufo"],"pdf_url":"https://arxiv.org/pdf/2401.05273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08565v2","updated":"2024-01-19T16:48:59Z","published":"2023-09-15T17:33:24Z","title":"How Transferable are Attribute Controllers on Pretrained Multilingual\n Translation Models?","summary":" Customizing machine translation models to comply with fine-grained attributes\nsuch as formality has seen tremendous progress recently. However, current\napproaches mostly rely on at least some supervised data with attribute\nannotation. Data scarcity therefore remains a bottleneck to democratizing such\ncustomization possibilities to a wider range of languages, lower-resource ones\nin particular. Given recent progress in pretrained massively multilingual\ntranslation models, we use them as a foundation to transfer the attribute\ncontrolling capabilities to languages without supervised data. In this work, we\npresent a comprehensive analysis of transferring attribute controllers based on\na pretrained NLLB-200 model. We investigate both training- and inference-time\ncontrol techniques under various data scenarios, and uncover their relative\nstrengths and weaknesses in zero-shot performance and domain robustness. We\nshow that both paradigms are complementary, as shown by consistent improvements\non 5 zero-shot directions. Moreover, a human evaluation on a real low-resource\nlanguage, Bengali, confirms our findings on zero-shot transfer to new target\nlanguages. 
The code is\n$\\href{https://github.com/dannigt/attribute-controller-transfer}{\\text{here}}$.\n","authors":["Danni Liu","Jan Niehues"],"pdf_url":"https://arxiv.org/pdf/2309.08565v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2302.12190v2","updated":"2024-01-19T16:30:14Z","published":"2023-02-23T17:31:40Z","title":"MCWDST: a Minimum-Cost Weighted Directed Spanning Tree Algorithm for\n Real-Time Fake News Mitigation in Social Media","summary":" The widespread availability of internet access and handheld devices confers\nto social media a power similar to the one newspapers used to have. People seek\naffordable information on social media and can reach it within seconds. Yet\nthis convenience comes with dangers; any user may freely post whatever they\nplease and the content can stay online for a long period, regardless of its\ntruthfulness. A need to detect untruthful information, also known as fake news,\narises. In this paper, we present an end-to-end solution that accurately\ndetects fake news and immunizes network nodes that spread them in real-time. To\ndetect fake news, we propose two new stack deep learning architectures that\nutilize convolutional and bidirectional LSTM layers. To mitigate the spread of\nfake news, we propose a real-time network-aware strategy that (1) constructs a\nminimum-cost weighted directed spanning tree for a detected node, and (2)\nimmunizes nodes in that tree by scoring their harmfulness using a novel ranking\nfunction. We demonstrate the effectiveness of our solution on five real-world\ndatasets.\n","authors":["Ciprian-Octavian Truică","Elena-Simona Apostol","Radu-Cătălin Nicolescu","Panagiotis Karras"],"pdf_url":"https://arxiv.org/pdf/2302.12190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v3","updated":"2024-01-19T16:01:28Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. 
Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v3.pdf","comment":"updated to version 2"},{"id":"http://arxiv.org/abs/2401.10774v1","updated":"2024-01-19T15:48:40Z","published":"2024-01-19T15:48:40Z","title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple\n Decoding Heads","summary":" The inference process in Large Language Models (LLMs) is often limited due to\nthe absence of parallelism in the auto-regressive decoding process, resulting\nin most operations being restricted by the memory bandwidth of accelerators.\nWhile methods such as speculative decoding have been suggested to address this\nissue, their implementation is impeded by the challenges associated with\nacquiring and maintaining a separate draft model. In this paper, we present\nMedusa, an efficient method that augments LLM inference by adding extra\ndecoding heads to predict multiple subsequent tokens in parallel. Using a\ntree-based attention mechanism, Medusa constructs multiple candidate\ncontinuations and verifies them simultaneously in each decoding step. By\nleveraging parallel processing, Medusa introduces only minimal overhead in\nterms of single-step latency while substantially reducing the number of\ndecoding steps required.\n We present two levels of fine-tuning procedures for Medusa to meet the needs\nof different use cases: Medusa-1: Medusa is directly fine-tuned on top of a\nfrozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa\nis fine-tuned together with the backbone LLM, enabling better prediction\naccuracy of Medusa heads and higher speedup but needing a special training\nrecipe that preserves the backbone model's capabilities.\n Moreover, we propose several extensions that improve or expand the utility of\nMedusa, including a self-distillation to handle situations where no training\ndata is available and a typical acceptance scheme to boost the acceptance rate\nwhile maintaining generation quality. We evaluate Medusa on models of various\nsizes and training procedures. Our experiments demonstrate that Medusa-1 can\nachieve over 2.2x speedup without compromising generation quality, while\nMedusa-2 further improves the speedup to 2.3-3.6x.\n","authors":["Tianle Cai","Yuhong Li","Zhengyang Geng","Hongwu Peng","Jason D. Lee","Deming Chen","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2401.10774v1.pdf","comment":"The code for this implementation is available at\n https://github.com/FasterDecoding/Medusa"},{"id":"http://arxiv.org/abs/2401.10768v1","updated":"2024-01-19T15:39:49Z","published":"2024-01-19T15:39:49Z","title":"Mitigating Hallucinations of Large Language Models via Knowledge\n Consistent Alignment","summary":" While Large Language Models (LLMs) have proven to be exceptional on a variety\nof tasks after alignment, they may still produce responses that contradict the\ncontext or world knowledge confidently, a phenomenon known as\n``hallucination''. In this paper, we demonstrate that reducing the\ninconsistency between the external knowledge encapsulated in the training data\nand the intrinsic knowledge inherited in the pretraining corpus could mitigate\nhallucination in alignment. 
Specifically, we introduce a novel knowledge\nconsistent alignment (KCA) approach, which involves automatically formulating\nexaminations based on external knowledge for accessing the comprehension of\nLLMs. For data encompassing knowledge inconsistency, KCA implements several\nsimple yet efficient strategies for processing. We illustrate the superior\nperformance of the proposed KCA approach in mitigating hallucinations across\nsix benchmarks using LLMs of different backbones and scales. Furthermore, we\nconfirm the correlation between knowledge inconsistency and hallucination,\nsignifying the effectiveness of reducing knowledge inconsistency in alleviating\nhallucinations. Our code, model weights, and data are public at\n\\url{https://github.com/fanqiwan/KCA}.\n","authors":["Fanqi Wan","Xinting Huang","Leyang Cui","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10768v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2306.16143v4","updated":"2024-01-19T15:05:14Z","published":"2023-06-28T12:17:45Z","title":"Generative User-Experience Research for Developing Domain-specific\n Natural Language Processing Applications","summary":" User experience (UX) is a part of human-computer interaction (HCI) research\nand focuses on increasing intuitiveness, transparency, simplicity, and trust\nfor the system users. Most UX research for machine learning (ML) or natural\nlanguage processing (NLP) focuses on a data-driven methodology. It engages\ndomain users mainly for usability evaluation. Moreover, more typical UX methods\ntailor the systems towards user usability, unlike learning about the user needs\nfirst. This paper proposes a new methodology for integrating generative UX\nresearch into developing domain NLP applications. Generative UX research\nemploys domain users at the initial stages of prototype development, i.e.,\nideation and concept evaluation, and the last stage for evaluating system\nusefulness and user utility. The methodology emerged from and is evaluated on a\ncase study about the full-cycle prototype development of a domain-specific\nsemantic search for daily operations in the process industry. A key finding of\nour case study is that involving domain experts increases their interest and\ntrust in the final NLP application. The combined UX+NLP research of the\nproposed method efficiently considers data- and user-driven opportunities and\nconstraints, which can be crucial for developing NLP applications.\n","authors":["Anastasia Zhukova","Lukas von Sperl","Christian E. Matt","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2306.16143v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10716v1","updated":"2024-01-19T14:27:44Z","published":"2024-01-19T14:27:44Z","title":"Structured Code Representations Enable Data-Efficient Adaptation of Code\n Language Models","summary":" Current language models tailored for code tasks often adopt the\npre-training-then-fine-tuning paradigm from natural language processing,\nmodeling source code as plain text. This approach, however, overlooks the\nunambiguous structures inherent in programming languages. In this work, we\nexplore data-efficient adaptation of pre-trained code models by further\npre-training and fine-tuning them with program structures. Specifically, we\nrepresent programs as parse trees -- also known as concrete syntax trees (CSTs)\n-- and adapt pre-trained models on serialized CSTs. 
Although the models that we\nadapt have been pre-trained only on the surface form of programs, we find that\na small amount of continual pre-training and fine-tuning on CSTs without\nchanging the model architecture yields improvements over the baseline approach\nacross various code tasks. The improvements are found to be particularly\nsignificant when there are limited training examples, demonstrating the\neffectiveness of integrating program structures with plain-text representation\neven when working with backbone models that have not been pre-trained with\nstructures.\n","authors":["Mayank Agarwal","Yikang Shen","Bailin Wang","Yoon Kim","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.10716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10712v1","updated":"2024-01-19T14:22:29Z","published":"2024-01-19T14:22:29Z","title":"Q&A Prompts: Discovering Rich Visual Clues through Mining\n Question-Answer Prompts for VQA requiring Diverse World Knowledge","summary":" With the breakthrough of multi-modal large language models, answering complex\nvisual questions that demand advanced reasoning abilities and world knowledge\nhas become a much more important testbed for developing AI models than ever.\nHowever, equipping AI models with robust cross-modality reasoning ability\nremains challenging since the cognition scheme of humans has not been\nunderstood systematically. In this paper, we believe that if we can collect\nvisual clues in the given image as much as possible, we will recognize the\nimage more accurately, understand the question better, recall relevant\nknowledge more easily, and finally reason out the answer. We discover these\nrich visual clues by mining question-answer pairs in images and sending them\ninto multi-modal large language models as prompts. We call the proposed method\nQ&A Prompts. Specifically, we first use the image-answer pairs and the\ncorresponding questions in the training set as inputs and outputs to train a\nvisual question generation model. Then, we use an image tagging model to\nidentify various instances and send packaged image-tag pairs into the visual\nquestion generation model to generate relevant questions with the extracted\nimage tags as answers. Finally, we encode these generated question-answer pairs\nas prompts with a visual-aware prompting module and send them into pre-trained\nmulti-modal large language models to reason out the final answers. Experimental\nresults show that, compared with state-of-the-art methods, our Q&A Prompts\nachieves substantial improvements on the challenging visual question answering\ndatasets requiring reasoning over diverse world knowledge, such as OK-VQA and\nA-OKVQA.\n","authors":["Haibi Wang","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10711v1","updated":"2024-01-19T14:21:46Z","published":"2024-01-19T14:21:46Z","title":"Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal\n Models for Video Question Answering","summary":" Video Question Answering (VideoQA) aims to answer natural language questions\nbased on the information observed in videos. Despite the recent success of\nLarge Multimodal Models (LMMs) in image-language understanding and reasoning,\nthey deal with VideoQA insufficiently by simply taking uniformly sampled frames\nas visual inputs, which ignores question-relevant visual clues. Moreover, there\nare no human annotations for question-critical timestamps in existing VideoQA\ndatasets. 
In light of this, we propose a novel weakly supervised framework to\nenforce the LMMs to reason out the answers with question-critical moments as\nvisual inputs. Specifically, we fuse the question and answer pairs as event\ndescriptions to find multiple keyframes as target moments, which will be\npseudo-labels. With these pseudo-labels as additionally weak supervision, we\ndevise a lightweight Gaussian-based Contrastive Grounding (GCG) module. GCG\nlearns multiple Gaussian functions to characterize the temporal structure of\nthe video, and sample question-critical frames as positive moments to be the\nvisual inputs of LMMs. Extensive experiments on several VideoQA benchmarks\nverify the effectiveness of our framework, and we achieve substantial\nimprovements compared to previous state-of-the-art methods.\n","authors":["Haibo Wang","Chenghang Lai","Yixuan Sun","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10695v1","updated":"2024-01-19T14:00:19Z","published":"2024-01-19T14:00:19Z","title":"LangBridge: Multilingual Reasoning Without Multilingual Supervision","summary":" We introduce LangBridge, a zero-shot approach to adapt language models for\nmultilingual reasoning tasks without multilingual supervision. LangBridge\noperates by bridging two models, each specialized in different aspects: (1) one\nspecialized in understanding multiple languages (e.g., mT5 encoder) and (2) one\nspecialized in reasoning (e.g., Orca 2). LangBridge connects the two models by\nintroducing minimal trainable parameters between them. Despite utilizing only\nEnglish data for training, LangBridge considerably enhances the performance of\nlanguage models on low-resource languages across mathematical reasoning,\ncoding, and logical reasoning. Our analysis suggests that the efficacy of\nLangBridge stems from the language-agnostic characteristics of multilingual\nrepresentations. We publicly release our code and models.\n","authors":["Dongkeun Yoon","Joel Jang","Sungdong Kim","Seungone Kim","Sheikh Shafayat","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2401.10695v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.09343v2","updated":"2024-01-19T13:33:22Z","published":"2024-01-17T17:08:36Z","title":"Efficient slot labelling","summary":" Slot labelling is an essential component of any dialogue system, aiming to\nfind important arguments in every user turn. Common approaches involve large\npre-trained language models (PLMs) like BERT or RoBERTa, but they face\nchallenges such as high computational requirements and dependence on\npre-training data. In this work, we propose a lightweight method which performs\non par or better than the state-of-the-art PLM-based methods, while having\nalmost 10x less trainable parameters. This makes it especially applicable for\nreal-life industry scenarios.\n","authors":["Vladimir Vlasov"],"pdf_url":"https://arxiv.org/pdf/2401.09343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10444v3","updated":"2024-01-19T13:19:13Z","published":"2023-09-19T09:04:15Z","title":"Exploring Iterative Enhancement for Improving Learnersourced\n Multiple-Choice Question Explanations with Large Language Models","summary":" Large language models exhibit superior capabilities in processing and\nunderstanding language, yet their applications in educational contexts remain\nunderexplored. Learnersourcing enhances learning by engaging students in\ncreating their own educational content. 
When learnersourcing multiple-choice\nquestions, creating explanations for the solution of a question is a crucial\nstep; it helps other students understand the solution and promotes a deeper\nunderstanding of related concepts. However, it is often difficult for students\nto craft effective solution explanations, due to limited subject understanding.\nTo help scaffold the task of automated explanation generation, we present and\nevaluate a framework called \"ILearner-LLM\", that iteratively enhances the\ngenerated explanations for the given questions with large language models.\nComprising an explanation generation model and an explanation evaluation model,\nthe framework generates high-quality student-aligned explanations by\niteratively feeding the quality rating score from the evaluation model back\ninto the instruction prompt of the explanation generation model. Experimental\nresults demonstrate the effectiveness of our ILearner-LLM on LLaMA2-13B and\nGPT-4 to generate higher quality explanations that are closer to those written\nby students on five PeerWise datasets. Our findings represent a promising path\nto enrich the learnersourcing experience for students and to enhance the\ncapabilities of large language models for educational applications.\n","authors":["Qiming Bao","Juho Leinonen","Alex Yuxuan Peng","Wanjun Zhong","Gaël Gendron","Timothy Pistotti","Alice Huang","Paul Denny","Michael Witbrock","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2309.10444v3.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2306.00168v3","updated":"2024-01-19T13:05:04Z","published":"2023-05-31T20:25:08Z","title":"Measuring the Robustness of NLP Models to Domain Shifts","summary":" Existing research on Domain Robustness (DR) suffers from disparate setups,\nlack of task variety, and scarce research on recent models and capabilities\nsuch as few-shot learning. Furthermore, we claim that the common practice of\nmeasuring DR might further obscure the picture. Current research focuses on\nchallenge sets and relies solely on the Source Drop (SD): Using the source\nin-domain performance as a reference point for degradation. However, the Target\nDrop (TD) should be used as a complementary point of view. To understand the DR\nchallenge in modern NLP models, we developed a benchmark comprised of seven NLP\ntasks, including classification, QA, and generation. Our benchmark focuses on\nnatural topical domain shifts and enables measuring both the SD and the TD. Our\ncomprehensive study, involving over 14,000 domain shifts across 18 fine-tuned\nand few-shot models, shows that both models suffer from drops upon domain\nshifts. While fine-tuned models excel in-domain, few-shot LLMs often surpass\nthem cross-domain, showing better robustness. In addition, we found that a\nlarge SD can be explained by shifting to a harder domain rather than a genuine\nDR challenge. 
Thus, the TD is a more reliable metric.\n","authors":["Nitay Calderon","Naveh Porat","Eyal Ben-David","Alexander Chapanin","Zorik Gekhman","Nadav Oved","Vitaly Shalumov","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2306.00168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01185v2","updated":"2024-01-19T12:34:07Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address (SOTU) dataset from Kaggle\nto make some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2.\n While it is widely believed that BERT (and its variations) is most suitable\nfor NLP classification tasks, we find out that GPT-2 in conjunction with\nnonlinear dimension reduction methods such as UMAP provide better separation\nand stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In\nour case, no model fine-tuning is required, and the pre-trained out-of-the-box\nGPT-2 model is enough.\n We also used a fine-tuned DistilBERT model for classification detecting which\nPresident delivered which address, with very good results (accuracy 93\\% - 95\\%\ndepending on the run). An analogous task was performed to determine the year of\nwriting, and we were able to pin it down to about 4 years (which is a single\npresidential term).\n It is worth noting that SOTU addresses provide relatively small writing\nsamples (with about 8000 words on average, and varying widely from under 2000\nwords to more than 20000), and that the amount of authors is relatively large\n(we used SOTU addresses of 42 US presidents). This shows that the techniques\nemployed turn out to be rather efficient, while all the computations described\nin this note can be performed using a single GPU instance of Google Colab.\n The accompanying code is available on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v2.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2401.10660v1","updated":"2024-01-19T12:26:57Z","published":"2024-01-19T12:26:57Z","title":"A Simple Framework to Accelerate Multilingual Language Model for\n Monolingual Text Generation","summary":" Recent advancements in large language models have facilitated the execution\nof complex language tasks, not only in English but also in non-English\nlanguages. However, the tokenizers of most language models, such as Llama,\ntrained on English-centric corpora, tend to excessively fragment tokens in\nnon-English languages. This issue is especially pronounced in non-roman\nalphabetic languages, which are often divided at a character or even Unicode\nlevel, leading to slower text generation. To address this, our study introduces\na novel framework designed to expedite text generation in these languages. This\nframework predicts larger linguistic units than those of conventional\nmultilingual tokenizers and is specifically tailored to the target language,\nthereby reducing the number of decoding steps required. 
Our empirical results\ndemonstrate that the proposed framework increases the generation speed by a\nfactor of 1.9 compared to standard decoding while maintaining the performance\nof a pre-trained multilingual model on monolingual tasks.\n","authors":["Jimin Hong","Gibbeum Lee","Jaewoong Cho"],"pdf_url":"https://arxiv.org/pdf/2401.10660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10653v1","updated":"2024-01-19T11:59:13Z","published":"2024-01-19T11:59:13Z","title":"Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech\n Detection","summary":" With the recent surge and exponential growth of social media usage,\nscrutinizing social media content for the presence of any hateful content is of\nutmost importance. Researchers have been diligently working since the past\ndecade on distinguishing between content that promotes hatred and content that\ndoes not. Traditionally, the main focus has been on analyzing textual content.\nHowever, recent research attempts have also commenced into the identification\nof audio-based content. Nevertheless, studies have shown that relying solely on\naudio or text-based content may be ineffective, as recent upsurge indicates\nthat individuals often employ sarcasm in their speech and writing. To overcome\nthese challenges, we present an approach to identify whether a speech promotes\nhate or not utilizing both audio and textual representations. Our methodology\nis based on the Transformer framework that incorporates both audio and text\nsampling, accompanied by our very own layer called \"Attentive Fusion\". The\nresults of our study surpassed previous state-of-the-art techniques, achieving\nan impressive macro F1 score of 0.927 on the Test Set.\n","authors":["Atanu Mandal","Gargi Roy","Amit Barman","Indranil Dutta","Sudip Kumar Naskar"],"pdf_url":"https://arxiv.org/pdf/2401.10653v1.pdf","comment":"Accepted in 20th International Conference on Natural Language\n Processing (ICON)"},{"id":"http://arxiv.org/abs/2401.10647v1","updated":"2024-01-19T11:48:09Z","published":"2024-01-19T11:48:09Z","title":"Sowing the Wind, Reaping the Whirlwind: The Impact of Editing Language\n Models","summary":" In the rapidly advancing field of artificial intelligence, the concept of\nRed-Teaming or Jailbreaking large language models (LLMs) has emerged as a\ncrucial area of study. This approach is especially significant in terms of\nassessing and enhancing the safety and robustness of these models. This paper\ninvestigates the intricate consequences of such modifications through model\nediting, uncovering a complex relationship between enhancing model accuracy and\npreserving its ethical integrity. Our in-depth analysis reveals a striking\nparadox: while injecting accurate information is crucial for model reliability,\nit can paradoxically destabilize the model's foundational framework, resulting\nin unpredictable and potentially unsafe behaviors. Additionally, we propose a\nbenchmark dataset NicheHazardQA to investigate this unsafe behavior both within\nthe same and cross topical domain. This aspect of our research sheds light on\nhow the edits, impact the model's safety metrics and guardrails. 
Our findings\nshow that model editing serves as a cost-effective tool for topical red-teaming\nby methodically applying targeted edits and evaluating the resultant model\nbehavior\n","authors":["Rima Hazra","Sayan Layek","Somnath Banerjee","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2401.10647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13274v2","updated":"2024-01-19T10:06:50Z","published":"2023-11-22T09:51:53Z","title":"Enhancing Summarization Performance through Transformer-Based Prompt\n Engineering in Automated Medical Reporting","summary":" Customized medical prompts enable Large Language Models (LLM) to effectively\naddress medical dialogue summarization. The process of medical reporting is\noften time-consuming for healthcare professionals. Implementing medical\ndialogue summarization techniques presents a viable solution to alleviate this\ntime constraint by generating automated medical reports. The effectiveness of\nLLMs in this process is significantly influenced by the formulation of the\nprompt, which plays a crucial role in determining the quality and relevance of\nthe generated reports. In this research, we used a combination of two distinct\nprompting strategies, known as shot prompting and pattern prompting to enhance\nthe performance of automated medical reporting. The evaluation of the automated\nmedical reports is carried out using the ROUGE score and a human evaluation\nwith the help of an expert panel. The two-shot prompting approach in\ncombination with scope and domain context outperforms other methods and\nachieves the highest score when compared to the human reference set by a\ngeneral practitioner. However, the automated reports are approximately twice as\nlong as the human references, due to the addition of both redundant and\nrelevant statements that are added to the report.\n","authors":["Daphne van Zandvoort","Laura Wiersema","Tom Huibers","Sandra van Dulmen","Sjaak Brinkkemper"],"pdf_url":"https://arxiv.org/pdf/2311.13274v2.pdf","comment":"12 pages, 4 figures, to be presented at HEALTHINF 2024, author\n contributions: research conducted and written by Daphne van Zandvoort and\n Laura Wiersema, research suggested and used software created by Tom Huibers,\n data provided and feedback provided by Sandra van Dulmen, supervision and\n feedback provided by Sjaak Brinkkemper"},{"id":"http://arxiv.org/abs/2311.12399v3","updated":"2024-01-19T09:49:46Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. 
Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v3.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.10580v1","updated":"2024-01-19T09:46:08Z","published":"2024-01-19T09:46:08Z","title":"PHOENIX: Open-Source Language Adaption for Direct Preference\n Optimization","summary":" Large language models have gained immense importance in recent years and have\ndemonstrated outstanding results in solving various tasks. However, despite\nthese achievements, many questions remain unanswered in the context of large\nlanguage models. Besides the optimal use of the models for inference and the\nalignment of the results to the desired specifications, the transfer of models\nto other languages is still an underdeveloped area of research. The recent\npublication of models such as Llama-2 and Zephyr has provided new insights into\narchitectural improvements and the use of human feedback. However, insights\ninto adapting these techniques to other languages remain scarce. In this paper,\nwe build on latest improvements and apply the Direct Preference\nOptimization(DPO) approach to the German language. The model is available at\nhttps://huggingface.co/DRXD1000/Phoenix.\n","authors":["Matthias Uhlig","Sigurd Schacht","Sudarshan Kamath Barkur"],"pdf_url":"https://arxiv.org/pdf/2401.10580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10567v1","updated":"2024-01-19T09:13:28Z","published":"2024-01-19T09:13:28Z","title":"Self-training from Self-memory in Data-to-text Generation","summary":" This paper introduces a novel training model, self-training from self-memory\n(STSM) in data-to-text generation (DTG), allowing the model to self-train on\nsubsets, including self-memory as outputs inferred directly from the trained\nmodels and/or the new data. The quality of self-memory is validated by two\nmodels, data-to-text (D2T) and text-to-data (T2D), by two pre-defined\nconditions: (1) the appearance of all source values in the outputs of the D2T\nmodel and (2) the ability to convert back to source data in the outputs in the\nT2D model. We utilize a greedy algorithm to generate shorter D2T outputs if\nthey contain all source values. Subsequently, we use the T2D model to confirm\nthat these outputs can capture input relationships by demonstrating their\ncapacity to convert text back into data. With 30% of the dataset, we can train\nthe D2T model with a competitive performance compared to full training in the\nsame setup. We experiment with our model on two datasets, E2E NLG and DART.\nSTSM offers the D2T model a generalization capability from its subset memory\nwhile reducing training data volume. Ultimately, we anticipate that this paper\nwill contribute to continual learning solutions that adapt to new training\ndata, incorporating it as a form of self-memory in DTG tasks. 
The curated\ndataset is publicly available at: https://github.com/hoangthangta/STSM.\n","authors":["Hoang-Thang Ta"],"pdf_url":"https://arxiv.org/pdf/2401.10567v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2401.09566v2","updated":"2024-01-19T08:57:19Z","published":"2024-01-17T19:43:43Z","title":"Aligning Large Language Models with Counterfactual DPO","summary":" Advancements in large language models (LLMs) have demonstrated remarkable\ncapabilities across a diverse range of applications. These models excel in\ngenerating text completions that are contextually coherent and cover an\nextensive array of subjects. However, the vast datasets required for their\ntraining make aligning response styles during the pretraining and instruction\ntuning phases challenging. Consequently, an additional alignment phase is\ntypically employed, wherein the model is further trained with human preference\ndata to better align its outputs with human expectations. While this process\ndoesn't introduce new capabilities per se, it does accentuate generation styles\ninnate to the model. This paper explores the utilization of counterfactual\nprompting within the framework of Direct Preference Optimization (DPO) to align\nthe model's style without relying on human intervention. We demonstrate that\nthis method effectively instils desirable behaviour, mitigates undesirable\nones, and encourages the model to disregard inappropriate instructions. Our\nfindings suggest that counterfactual prompting with DPO presents a low-resource\nway to fine-tune LLMs to meet the demands for responsible and ethically aligned\nAI systems.\n","authors":["Bradley Butcher"],"pdf_url":"https://arxiv.org/pdf/2401.09566v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10559v1","updated":"2024-01-19T08:50:54Z","published":"2024-01-19T08:50:54Z","title":"OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy","summary":" We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel\nmulti-adapter method, OrchMoE, which capitalizes on modular skill architecture\nfor enhanced forward transfer in neural networks. Unlike prior models that\ndepend on explicit task identification inputs, OrchMoE automatically discerns\ntask categories, streamlining the learning process. This is achieved through an\nintegrated mechanism comprising an Automatic Task Classification module and a\nTask-Skill Allocation module, which collectively deduce task-specific\nclassifications and tailor skill allocation matrices. Our extensive evaluations\non the 'Super Natural Instructions' dataset, featuring 1,600 diverse\ninstructional tasks, indicate that OrchMoE substantially outperforms comparable\nmulti-adapter baselines in terms of both performance and sample utilization\nefficiency, all while operating within the same parameter constraints. 
These\nfindings suggest that OrchMoE offers a significant leap forward in multi-task\nlearning efficiency.\n","authors":["Haowen Wang","Tao Sun","Kaixiang Ji","Jian Wang","Cong Fan","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2401.10559v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.08326v2","updated":"2024-01-19T08:48:37Z","published":"2024-01-16T12:45:15Z","title":"RoTBench: A Multi-Level Benchmark for Evaluating the Robustness of Large\n Language Models in Tool Learning","summary":" Tool learning has generated widespread interest as a vital means of\ninteraction between Large Language Models (LLMs) and the physical world.\nCurrent research predominantly emphasizes LLMs' capacity to utilize tools in\nwell-structured environments while overlooking their stability when confronted\nwith the inevitable noise of the real world. To bridge this gap, we introduce\nRoTBench, a multi-level benchmark for evaluating the robustness of LLMs in tool\nlearning. Specifically, we establish five external environments, each featuring\nvarying levels of noise (i.e., Clean, Slight, Medium, Heavy, and Union),\nproviding an in-depth analysis of the model's resilience across three critical\nphases: tool selection, parameter identification, and content filling.\nExperiments involving six widely-used models underscore the urgent necessity\nfor enhancing the robustness of LLMs in tool learning. For instance, the\nperformance of GPT-4 even drops significantly from 80.00 to 58.10 when there is\nno substantial change in manual accuracy. More surprisingly, the noise\ncorrection capability inherent in the GPT family paradoxically impedes its\nadaptability in the face of mild noise. In light of these findings, we propose\nRoTTuning, a strategy that enriches the diversity of training environments to\nbolster the robustness of LLMs in tool learning. The code and data are\navailable at https://github.com/Junjie-Ye/RoTBench.\n","authors":["Junjie Ye","Yilong Wu","Songyang Gao","Caishuang Huang","Sixian Li","Guanyu Li","Xiaoran Fan","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10543v1","updated":"2024-01-19T08:02:37Z","published":"2024-01-19T08:02:37Z","title":"Multilingual acoustic word embeddings for zero-resource languages","summary":" This research addresses the challenge of developing speech applications for\nzero-resource languages that lack labelled data. It specifically uses acoustic\nword embedding (AWE) -- fixed-dimensional representations of variable-duration\nspeech segments -- employing multilingual transfer, where labelled data from\nseveral well-resourced languages are used for pertaining. The study introduces\na new neural network that outperforms existing AWE models on zero-resource\nlanguages. 
It explores the impact of the choice of well-resourced languages.\nAWEs are applied to a keyword-spotting system for hate speech detection in\nSwahili radio broadcasts, demonstrating robustness in real-world scenarios.\nAdditionally, novel semantic AWE models improve semantic query-by-example\nsearch.\n","authors":["Christiaan Jacobs","Herman Kamper"],"pdf_url":"https://arxiv.org/pdf/2401.10543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14995v2","updated":"2024-01-19T07:47:01Z","published":"2023-07-27T16:45:33Z","title":"TransNormerLLM: A Faster and Better Large Language Model with Improved\n TransNormer","summary":" We present TransNormerLLM, the first linear attention-based Large Language\nModel (LLM) that outperforms conventional softmax attention-based models in\nterms of both accuracy and efficiency. TransNormerLLM evolves from the previous\nlinear attention architecture TransNormer by making advanced modifications that\ninclude positional embedding, linear attention acceleration, gating mechanisms,\ntensor normalization, and inference acceleration and stabilization.\nSpecifically, we use LRPE together with an exponential decay to avoid attention\ndilution issues while allowing the model to retain global interactions between\ntokens. Additionally, we propose Lightning Attention, a cutting-edge technique\nthat accelerates linear attention by more than twice in runtime and reduces\nmemory usage by a remarkable four times. To further enhance the performance of\nTransNormer, we leverage a gating mechanism for smooth training and a new\ntensor normalization scheme to accelerate the model, resulting in an impressive\nacceleration of over $20\\%$. Furthermore, we develop a robust inference\nalgorithm that ensures numerical stability and consistent inference speed,\nregardless of the sequence length, showcasing superior efficiency during both\ntraining and inference stages. We also implement an efficient model parallel\nschema for TransNormerLLM, enabling seamless deployment on large-scale clusters\nand facilitating expansion to even more extensive models, i.e., LLMs with 175B\nparameters. We validate our model design through a series of ablations and\ntrain models with sizes of 385M, 1B, and 7B on our self-collected corpus.\nBenchmark results demonstrate that our models not only match the performance of\nstate-of-the-art LLMs with Transformer but are also significantly faster. Code\nis released at: https://github.com/OpenNLPLab/TransnormerLLM.\n","authors":["Zhen Qin","Dong Li","Weigao Sun","Weixuan Sun","Xuyang Shen","Xiaodong Han","Yunshen Wei","Baohong Lv","Xiao Luo","Yu Qiao","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2307.14995v2.pdf","comment":"Technical Report. Yiran Zhong is the corresponding author. Zhen Qin,\n Dong Li, Weigao Sun, Weixuan Sun, Xuyang Shen contribute equally to this\n paper. Code is released at: https://github.com/OpenNLPLab/TransnormerLLM"},{"id":"http://arxiv.org/abs/2401.10536v1","updated":"2024-01-19T07:30:57Z","published":"2024-01-19T07:30:57Z","title":"Speech Swin-Transformer: Exploring a Hierarchical Transformer with\n Shifted Windows for Speech Emotion Recognition","summary":" Swin-Transformer has demonstrated remarkable success in computer vision by\nleveraging its hierarchical feature representation based on Transformer. In\nspeech signals, emotional information is distributed across different scales of\nspeech features, e.\\,g., word, phrase, and utterance. 
Drawing above\ninspiration, this paper presents a hierarchical speech Transformer with shifted\nwindows to aggregate multi-scale emotion features for speech emotion\nrecognition (SER), called Speech Swin-Transformer. Specifically, we first\ndivide the speech spectrogram into segment-level patches in the time domain,\ncomposed of multiple frame patches. These segment-level patches are then\nencoded using a stack of Swin blocks, in which a local window Transformer is\nutilized to explore local inter-frame emotional information across frame\npatches of each segment patch. After that, we also design a shifted window\nTransformer to compensate for patch correlations near the boundaries of segment\npatches. Finally, we employ a patch merging operation to aggregate\nsegment-level emotional features for hierarchical speech representation by\nexpanding the receptive field of Transformer from frame-level to segment-level.\nExperimental results demonstrate that our proposed Speech Swin-Transformer\noutperforms the state-of-the-art methods.\n","authors":["Yong Wang","Cheng Lu","Hailun Lian","Yan Zhao","Björn Schuller","Yuan Zong","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.10536v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.10535v1","updated":"2024-01-19T07:21:45Z","published":"2024-01-19T07:21:45Z","title":"The \"Colonial Impulse\" of Natural Language Processing: An Audit of\n Bengali Sentiment Analysis Tools and Their Identity-based Biases","summary":" While colonization has sociohistorically impacted people's identities across\nvarious dimensions, those colonial values and biases continue to be perpetuated\nby sociotechnical systems. One category of sociotechnical systems--sentiment\nanalysis tools--can also perpetuate colonial values and bias, yet less\nattention has been paid to how such tools may be complicit in perpetuating\ncoloniality, although they are often used to guide various practices (e.g.,\ncontent moderation). In this paper, we explore potential bias in sentiment\nanalysis tools in the context of Bengali communities that have experienced and\ncontinue to experience the impacts of colonialism. Drawing on identity\ncategories most impacted by colonialism amongst local Bengali communities, we\nfocused our analytic attention on gender, religion, and nationality. We\nconducted an algorithmic audit of all sentiment analysis tools for Bengali,\navailable on the Python package index (PyPI) and GitHub. Despite similar\nsemantic content and structure, our analyses showed that in addition to\ninconsistencies in output from different tools, Bengali sentiment analysis\ntools exhibit bias between different identity categories and respond\ndifferently to different ways of identity expression. Connecting our findings\nwith colonially shaped sociocultural structures of Bengali communities, we\ndiscuss the implications of downstream bias of sentiment analysis tools.\n","authors":["Dipto Das","Shion Guha","Jed Brubaker","Bryan Semaan"],"pdf_url":"https://arxiv.org/pdf/2401.10535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. 
However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of cooccurring behaviors, and the\ncompounding impact of behavioral hallucinations. Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10521v1","updated":"2024-01-19T06:54:39Z","published":"2024-01-19T06:54:39Z","title":"Cross-lingual Editing in Multilingual Language Models","summary":" The training of large language models (LLMs) necessitates substantial data\nand computational resources, and updating outdated LLMs entails significant\nefforts and resources. While numerous model editing techniques (METs) have\nemerged to efficiently update model outputs without retraining, their\neffectiveness in multilingual LLMs, where knowledge is stored in diverse\nlanguages, remains an underexplored research area. This research paper\nintroduces the cross-lingual model editing (\\textbf{XME}) paradigm, wherein a\nfact is edited in one language, and the subsequent update propagation is\nobserved across other languages. To investigate the XME paradigm, we conducted\nexperiments using BLOOM, mBERT, and XLM-RoBERTa using the two writing scripts:\n\\textit{Latin} (English, French, and Spanish) and \\textit{Indic} (Hindi,\nGujarati, and Bengali). The results reveal notable performance limitations of\nstate-of-the-art METs under the XME setting, mainly when the languages involved\nbelong to two distinct script families. 
These findings highlight the need for\nfurther research and development of XME techniques to address these challenges.\nFor more comprehensive information, the dataset used in this research and the\nassociated code are publicly available at the following\nURL\\url{https://github.com/lingo-iitgn/XME}.\n","authors":["Himanshu Beniwal","Kowsik Nandagopan D","Mayank Singh"],"pdf_url":"https://arxiv.org/pdf/2401.10521v1.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2312.15880v2","updated":"2024-01-19T06:42:16Z","published":"2023-12-26T04:22:56Z","title":"KnowledgeNavigator: Leveraging Large Language Models for Enhanced\n Reasoning over Knowledge Graph","summary":" Large language model (LLM) has achieved outstanding performance on various\ndownstream tasks with its powerful natural language understanding and zero-shot\ncapability, but LLM still suffers from knowledge limitation. Especially in\nscenarios that require long logical chains or complex reasoning, the\nhallucination and knowledge limitation of LLM limit its performance in question\nanswering (QA). In this paper, we propose a novel framework KnowledgeNavigator\nto address these challenges by efficiently and accurately retrieving external\nknowledge from knowledge graph and using it as a key factor to enhance LLM\nreasoning. Specifically, KnowledgeNavigator first mines and enhances the\npotential constraints of the given question to guide the reasoning. Then it\nretrieves and filters external knowledge that supports answering through\niterative reasoning on knowledge graph with the guidance of LLM and the\nquestion. Finally, KnowledgeNavigator constructs the structured knowledge into\neffective prompts that are friendly to LLM to help its reasoning. We evaluate\nKnowledgeNavigator on multiple public KGQA benchmarks, the experiments show the\nframework has great effectiveness and generalization, outperforming previous\nknowledge graph enhanced LLM methods and is comparable to the fully supervised\nmodels.\n","authors":["Tiezheng Guo","Qingwen Yang","Chen Wang","Yanyi Liu","Pan Li","Jiawei Tang","Dapeng Li","Yingyou Wen"],"pdf_url":"https://arxiv.org/pdf/2312.15880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05492v3","updated":"2024-01-19T06:06:46Z","published":"2023-10-09T07:56:16Z","title":"How Abilities in Large Language Models are Affected by Supervised\n Fine-tuning Data Composition","summary":" Large language models (LLMs) with enormous pre-training tokens and parameters\nemerge diverse abilities, including math reasoning, code generation, and\ninstruction following. These abilities are further enhanced by supervised\nfine-tuning (SFT). While the open-source community has explored ad-hoc SFT for\nenhancing individual capabilities, proprietary LLMs exhibit versatility across\nvarious skills. Therefore, understanding the facilitation of multiple abilities\nvia SFT is paramount. In this study, we specifically focuses on the interplay\nof data composition between mathematical reasoning, code generation, and\ngeneral human-aligning abilities during SFT. We propose four intriguing\nresearch questions to explore the association between model performance and\nvarious factors including data amount, composition ratio, model size and SFT\nstrategies. 
Our experiments reveal that distinct capabilities scale differently\nand larger models generally show superior performance with same amount of data.\nMathematical reasoning and code generation consistently improve with increasing\ndata amount, whereas general abilities plateau after roughly a thousand\nsamples. Moreover, we observe data composition appears to enhance various\nabilities under limited data conditions, yet can lead to performance conflicts\nwhen data is plentiful. Our findings also suggest the amount of composition\ndata influences performance more than the composition ratio. In analysis of SFT\nstrategies, we find that sequentially learning multiple skills risks\ncatastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT)\nstrategy offers a promising solution to learn multiple abilities with different\nscaling patterns.\n","authors":["Guanting Dong","Hongyi Yuan","Keming Lu","Chengpeng Li","Mingfeng Xue","Dayiheng Liu","Wei Wang","Zheng Yuan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10510v1","updated":"2024-01-19T05:58:30Z","published":"2024-01-19T05:58:30Z","title":"A match made in consistency heaven: when large language models meet\n evolutionary algorithms","summary":" Pre-trained large language models (LLMs) have powerful capabilities for\ngenerating creative natural text. Evolutionary algorithms (EAs) can discover\ndiverse solutions to complex real-world problems. Motivated by the common\ncollective and directionality of text sequence generation and evolution, this\npaper illustrates the strong consistency of LLMs and EAs, which includes\nmultiple one-to-one key characteristics: token embedding and genotype-phenotype\nmapping, position encoding and fitness shaping, position embedding and\nselection, attention and crossover, feed-forward neural network and mutation,\nmodel training and parameter update, and multi-task learning and\nmulti-objective optimization. Based on this consistency perspective, existing\ncoupling studies are analyzed, including evolutionary fine-tuning and\nLLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap\nfor future research in coupling LLMs and EAs, while highlighting key challenges\nalong the way. The consistency not only reveals the evolution mechanism behind\nLLMs but also facilitates the development of evolved artificial agents that\napproach or surpass biological organisms.\n","authors":["Wang Chao","Jiaxuan Zhao","Licheng Jiao","Lingling Li","Fang Liu","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10510v1.pdf","comment":"A perspective article under review"},{"id":"http://arxiv.org/abs/2401.10506v1","updated":"2024-01-19T05:48:07Z","published":"2024-01-19T05:48:07Z","title":"FinSQL: Model-Agnostic LLMs-based Text-to-SQL Framework for Financial\n Analysis","summary":" Text-to-SQL, which provides zero-code interface for operating relational\ndatabases, has gained much attention in financial analysis; because, financial\nprofessionals may not well-skilled in SQL programming. However, until now,\nthere is no practical Text-to-SQL benchmark dataset for financial analysis, and\nexisting Text-to-SQL methods have not considered the unique characteristics of\ndatabases in financial applications, such as commonly existing wide tables. 
To\naddress these issues, we collect a practical Text-to-SQL benchmark dataset and\npropose a model-agnostic Large Language Model (LLMs)-based Text-to-SQL\nframework for financial analysis. The benchmark dataset, BULL, is collected\nfrom the practical financial analysis business of Hundsun Technologies Inc.,\nincluding databases for fund, stock, and macro economy. Besides, the proposed\nLLMs-based Text-to-SQL framework, FinSQL, provides a systematic treatment for\nfinancial Text-to-SQL from the perspectives of prompt construction,\nparameter-efficient fine-tuning and output calibration. Extensive experimental\nresults on BULL demonstrate that FinSQL achieves the state-of-the-art\nText-to-SQL performance at a small cost; furthermore, FinSQL can bring up to\n36.64% performance improvement in scenarios requiring few-shot cross-database\nmodel transfer.\n","authors":["Chao Zhang","Yuren Mao","Yijiang Fan","Yu Mi","Yunjun Gao","Lu Chen","Dongfang Lou","Jinshu Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10506v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.00368v2","updated":"2024-01-19T05:16:20Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":" In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across nearly 100 languages. We then\nfine-tune open-source decoder-only LLMs on the synthetic data using standard\ncontrastive loss. Experiments demonstrate that our method achieves strong\nperformance on highly competitive text embedding benchmarks without using any\nlabeled data. Furthermore, when fine-tuned with a mixture of synthetic and\nlabeled data, our model sets new state-of-the-art results on the BEIR and MTEB\nbenchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v2.pdf","comment":"20 pages, 15 tables"},{"id":"http://arxiv.org/abs/2401.10491v1","updated":"2024-01-19T05:02:46Z","published":"2024-01-19T05:02:46Z","title":"Knowledge Fusion of Large Language Models","summary":" While training large language models (LLMs) from scratch can generate models\nwith distinct functionalities and strengths, it comes at significant costs and\nmay result in redundant capabilities. Alternatively, a cost-effective and\ncompelling approach is to merge existing pre-trained LLMs into a more potent\nmodel. However, due to the varying architectures of these LLMs, directly\nblending their weights is impractical. In this paper, we introduce the notion\nof knowledge fusion for LLMs, aimed at combining the capabilities of existing\nLLMs and transferring them into a single LLM. By leveraging the generative\ndistributions of source LLMs, we externalize their collective knowledge and\nunique strengths, thereby potentially elevating the capabilities of the target\nmodel beyond those of any individual source LLM. 
We validate our approach using\nthree popular LLMs with different architectures--Llama-2, MPT, and\nOpenLLaMA--across various benchmarks and tasks. Our findings confirm that the\nfusion of LLMs can improve the performance of the target model across a range\nof capabilities such as reasoning, commonsense, and code generation. Our code,\nmodel weights, and data are public at\n\\url{https://github.com/fanqiwan/FuseLLM}.\n","authors":["Fanqi Wan","Xinting Huang","Deng Cai","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10491v1.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09972v2","updated":"2024-01-19T04:29:42Z","published":"2024-01-18T13:41:08Z","title":"Better Explain Transformers by Illuminating Important Information","summary":" Transformer-based models excel in various natural language processing (NLP)\ntasks, attracting countless efforts to explain their inner workings. Prior\nmethods explain Transformers by focusing on the raw gradient and attention as\ntoken attribution scores, where non-relevant information is often considered\nduring explanation computation, resulting in confusing results. In this work,\nwe propose highlighting the important information and eliminating irrelevant\ninformation by a refined information flow on top of the layer-wise relevance\npropagation (LRP) method. Specifically, we consider identifying syntactic and\npositional heads as important attention heads and focus on the relevance\nobtained from these important heads. Experimental results demonstrate that\nirrelevant information does distort output attribution scores and then should\nbe masked during explanation computation. Compared to eight baselines on both\nclassification and question-answering datasets, our method consistently\noutperforms with over 3\\% to 33\\% improvement on explanation metrics, providing\nsuperior explanation performance. Our anonymous code repository is available\nat: https://github.com/LinxinS97/Mask-LRP\n","authors":["Linxin Song","Yan Cui","Ao Luo","Freddy Lecue","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2401.09972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10487v1","updated":"2024-01-19T04:24:07Z","published":"2024-01-19T04:24:07Z","title":"Generative Dense Retrieval: Memory Can Be a Burden","summary":" Generative Retrieval (GR), autoregressively decoding relevant document\nidentifiers given a query, has been shown to perform well under the setting of\nsmall-scale corpora. By memorizing the document corpus with model parameters,\nGR implicitly achieves deep interaction between query and document. However,\nsuch a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for\nfine-grained features of documents; (2) Memory confusion gets worse as the\ncorpus size increases; (3) Huge memory update costs for new documents. To\nalleviate these problems, we propose the Generative Dense Retrieval (GDR)\nparadigm. Specifically, GDR first uses the limited memory volume to achieve\ninter-cluster matching from query to relevant document clusters.\nMemorizing-free matching mechanism from Dense Retrieval (DR) is then introduced\nto conduct fine-grained intra-cluster matching from clusters to relevant\ndocuments. The coarse-to-fine process maximizes the advantages of GR's deep\ninteraction and DR's scalability. 
Besides, we design a cluster identifier\nconstructing strategy to facilitate corpus memory and a cluster-adaptive\nnegative sampling strategy to enhance the intra-cluster mapping ability.\nEmpirical results show that GDR obtains an average of 3.0 R@100 improvement on\nNQ dataset under multiple settings and has better scalability.\n","authors":["Peiwen Yuan","Xinglin Wang","Shaoxiong Feng","Boyuan Pan","Yiwei Li","Heda Wang","Xupeng Miao","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10487v1.pdf","comment":"EACL 2024 main"},{"id":"http://arxiv.org/abs/2401.10480v1","updated":"2024-01-19T04:03:59Z","published":"2024-01-19T04:03:59Z","title":"Escape Sky-high Cost: Early-stopping Self-Consistency for Multi-step\n Reasoning","summary":" Self-consistency (SC) has been a widely used decoding strategy for\nchain-of-thought reasoning. Despite bringing significant performance\nimprovements across a variety of multi-step reasoning tasks, it is a high-cost\nmethod that requires multiple sampling with the preset size. In this paper, we\npropose a simple and scalable sampling process, \\textbf{E}arly-Stopping\n\\textbf{S}elf-\\textbf{C}onsistency (ESC), to greatly reduce the cost of SC\nwithout sacrificing performance. On this basis, one control scheme for ESC is\nfurther derivated to dynamically choose the performance-cost balance for\ndifferent tasks and models. To demonstrate ESC's effectiveness, we conducted\nextensive experiments on three popular categories of reasoning tasks:\narithmetic, commonsense and symbolic reasoning over language models with\nvarying scales. The empirical results show that ESC reduces the average number\nof sampling of chain-of-thought reasoning by a significant margin on six\nbenchmarks, including MATH (-33.8%), GSM8K (-80.1%), StrategyQA (-76.8%),\nCommonsenseQA (-78.5%), Coin Flip (-84.2%) and Last Letters (-67.4%), while\nattaining comparable performances.\n","authors":["Yiwei Li","Peiwen Yuan","Shaoxiong Feng","Boyuan Pan","Xinglin Wang","Bin Sun","Heda Wang","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10480v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10472v1","updated":"2024-01-19T03:49:28Z","published":"2024-01-19T03:49:28Z","title":"Name Tagging Under Domain Shift via Metric Learning for Life Sciences","summary":" Name tagging is a key component of Information Extraction (IE), particularly\nin scientific domains such as biomedicine and chemistry, where large language\nmodels (LLMs), e.g., ChatGPT, fall short. We investigate the applicability of\ntransfer learning for enhancing a name tagging model trained in the biomedical\ndomain (the source domain) to be used in the chemical domain (the target\ndomain). A common practice for training such a model in a few-shot learning\nsetting is to pretrain the model on the labeled source data, and then, to\nfinetune it on a hand-full of labeled target examples. In our experiments we\nobserved that such a model is prone to mis-labeling the source entities, which\ncan often appear in the text, as the target entities. To alleviate this\nproblem, we propose a model to transfer the knowledge from the source domain to\nthe target domain, however, at the same time, to project the source entities\nand target entities into separate regions of the feature space. This diminishes\nthe risk of mis-labeling the source entities as the target entities. 
Our model\nconsists of two stages: 1) entity grouping in the source domain, which\nincorporates knowledge from annotated events to establish relations between\nentities, and 2) entity discrimination in the target domain, which relies on\npseudo labeling and contrastive learning to enhance discrimination between the\nentities in the two domains. We carry out our extensive experiments across\nthree source and three target datasets, and demonstrate that our method\noutperforms the baselines, in some scenarios by 5\\% absolute value.\n","authors":["Hongyi Liu","Qingyun Wang","Payam Karisani","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2401.10472v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2401.10471v1","updated":"2024-01-19T03:48:27Z","published":"2024-01-19T03:48:27Z","title":"DeepEdit: Knowledge Editing as Decoding with Constraints","summary":" We develop a new perspective of knowledge editing for large language models\n(LLMs) as decoding with constraints. We propose DeepEdit (Depth-first Search\nbased Progressive Decoding for Knowledge Editing), a neuro-symbolic method that\nimproves knowledge editing with better coherence of reasoning, relevance to the\nquestion, and awareness of updated knowledge. DeepEdit can be flexibly applied\nto all black-box LLMs: it does not require any access to the model parameters,\nrepresentations, or output vocabulary distributions. DeepEdit progressively\nproduces the high-quality reasoning steps towards effective knowledge editing.\nIt utilizes a depth-first search to revise the LLMs' output, which improves the\noutput's informativeness to the input question and awareness of the updated\nknowledge. Qualitatively, DeepEdit effectively controls LLMs to produce more\nsuccinct reasoning in accord with knowledge editing. Quantitatively, DeepEdit\nyields significant gains on MQuaKE, a challenging multi-hop question-answering\ndataset with knowledge editing. We release the source code at\nhttps://github.com/wangywUST/DeepEdit.\n","authors":["Yiwei Wang","Muhao Chen","Nanyun Peng","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2401.10471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10465v1","updated":"2024-01-19T03:37:27Z","published":"2024-01-19T03:37:27Z","title":"Data-driven grapheme-to-phoneme representations for a lexicon-free\n text-to-speech","summary":" Grapheme-to-Phoneme (G2P) is an essential first step in any modern,\nhigh-quality Text-to-Speech (TTS) system. Most of the current G2P systems rely\non carefully hand-crafted lexicons developed by experts. This poses a two-fold\nproblem. Firstly, the lexicons are generated using a fixed phoneme set,\nusually, ARPABET or IPA, which might not be the most optimal way to represent\nphonemes for all languages. Secondly, the man-hours required to produce such an\nexpert lexicon are very high. In this paper, we eliminate both of these issues\nby using recent advances in self-supervised learning to obtain data-driven\nphoneme representations instead of fixed representations. We compare our\nlexicon-free approach against strong baselines that utilize a well-crafted\nlexicon. Furthermore, we show that our data-driven lexicon-free method performs\nas good or even marginally better than the conventional rule-based or\nlexicon-based neural G2Ps in terms of Mean Opinion Score (MOS) while using no\nprior language lexicon or phoneme set, i.e. 
no linguistic expertise.\n","authors":["Abhinav Garg","Jiyeon Kim","Sushil Khyalia","Chanwoo Kim","Dhananjaya Gowda"],"pdf_url":"https://arxiv.org/pdf/2401.10465v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.10463v1","updated":"2024-01-19T03:24:36Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language models training dynamics. We develop a grokking\nconfiguration to reproduce grokking on simplistic language models stably by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking across\nsample-wise and model-wise, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03279v2","updated":"2024-01-19T02:26:38Z","published":"2023-08-07T03:39:52Z","title":"UniversalNER: Targeted Distillation from Large Language Models for Open\n Named Entity Recognition","summary":" Large language models (LLMs) have demonstrated remarkable generalizability,\nsuch as understanding arbitrary entities and relations. Instruction tuning has\nproven effective for distilling LLMs into more cost-efficient models such as\nAlpaca and Vicuna. Yet such student models still trail the original LLMs by\nlarge margins in downstream applications. In this paper, we explore targeted\ndistillation with mission-focused instruction tuning to train student models\nthat can excel in a broad application class such as open information\nextraction. Using named entity recognition (NER) for case study, we show how\nChatGPT can be distilled into much smaller UniversalNER models for open NER.\nFor evaluation, we assemble the largest NER benchmark to date, comprising 43\ndatasets across 9 diverse domains such as biomedicine, programming, social\nmedia, law, finance. Without using any direct supervision, UniversalNER attains\nremarkable NER accuracy across tens of thousands of entity types, outperforming\ngeneral instruction-tuned models such as Alpaca and Vicuna by over 30 absolute\nF1 points in average. With a tiny fraction of parameters, UniversalNER not only\nacquires ChatGPT's capability in recognizing arbitrary entity types, but also\noutperforms its NER accuracy by 7-9 absolute F1 points in average. Remarkably,\nUniversalNER even outperforms by a large margin state-of-the-art multi-task\ninstruction-tuned systems such as InstructUIE, which uses supervised NER\nexamples. We also conduct thorough ablation studies to assess the impact of\nvarious components in our distillation approach. 
We release the distillation\nrecipe, data, and UniversalNER models to facilitate future research on targeted\ndistillation.\n","authors":["Wenxuan Zhou","Sheng Zhang","Yu Gu","Muhao Chen","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.03279v2.pdf","comment":"Accepted at ICLR 2024. Project page: https://universal-ner.github.io/"},{"id":"http://arxiv.org/abs/2401.10449v1","updated":"2024-01-19T01:36:07Z","published":"2024-01-19T01:36:07Z","title":"Contextualized Automatic Speech Recognition with Attention-Based Bias\n Phrase Boosted Beam Search","summary":" End-to-end (E2E) automatic speech recognition (ASR) methods exhibit\nremarkable performance. However, since the performance of such methods is\nintrinsically linked to the context present in the training data, E2E-ASR\nmethods do not perform as desired for unseen user contexts (e.g., technical\nterms, personal names, and playlists). Thus, E2E-ASR methods must be easily\ncontextualized by the user or developer. This paper proposes an attention-based\ncontextual biasing method that can be customized using an editable phrase list\n(referred to as a bias list). The proposed method can be trained effectively by\ncombining a bias phrase index loss and special tokens to detect the bias\nphrases in the input speech data. In addition, to improve the contextualization\nperformance during inference further, we propose a bias phrase boosted (BPB)\nbeam search algorithm based on the bias phrase index probability. Experimental\nresults demonstrate that the proposed method consistently improves the word\nerror rate and the character error rate of the target phrases in the bias list\non both the Librispeech-960 (English) and our in-house (Japanese) dataset,\nrespectively.\n","authors":["Yui Sudo","Muhammad Shakeel","Yosuke Fukumoto","Yifan Peng","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2401.10449v1.pdf","comment":"accepted by ICASSP20224"},{"id":"http://arxiv.org/abs/2401.10447v1","updated":"2024-01-19T01:30:16Z","published":"2024-01-19T01:30:16Z","title":"Investigating Training Strategies and Model Robustness of Low-Rank\n Adaptation for Language Modeling in Speech Recognition","summary":" The use of low-rank adaptation (LoRA) with frozen pretrained language models\n(PLMs) has become increasing popular as a mainstream, resource-efficient\nmodeling approach for memory-constrained hardware. In this study, we first\nexplore how to enhance model performance by introducing various LoRA training\nstrategies, achieving relative word error rate reductions of 3.50\\% on the\npublic Librispeech dataset and of 3.67\\% on an internal dataset in the\nmessaging domain. To further characterize the stability of LoRA-based\nsecond-pass speech recognition models, we examine robustness against input\nperturbations. These perturbations are rooted in homophone replacements and a\nnovel metric called N-best Perturbation-based Rescoring Robustness (NPRR), both\ndesigned to measure the relative degradation in the performance of rescoring\nmodels. 
Our experimental results indicate that while advanced variants of LoRA,\nsuch as dynamic rank-allocated LoRA, lead to performance degradation in\n$1$-best perturbation, they alleviate the degradation in $N$-best perturbation.\nThis finding is in comparison to fully-tuned models and vanilla LoRA tuning\nbaselines, suggesting that a comprehensive selection is needed when using\nLoRA-based adaptation for compute-cost savings and robust language modeling.\n","authors":["Yu Yu","Chao-Han Huck Yang","Tuan Dinh","Sungho Ryu","Jari Kolehmainen","Roger Ren","Denis Filimonov","Prashanth G. Shivakumar","Ankur Gandhe","Ariya Rastow","Jia Xu","Ivan Bulyko","Andreas Stolcke"],"pdf_url":"https://arxiv.org/pdf/2401.10447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10446v1","updated":"2024-01-19T01:29:27Z","published":"2024-01-19T01:29:27Z","title":"Large Language Models are Efficient Learners of Noise-Robust Speech\n Recognition","summary":" Recent advances in large language models (LLMs) have promoted generative\nerror correction (GER) for automatic speech recognition (ASR), which leverages\nthe rich linguistic knowledge and powerful reasoning ability of LLMs to improve\nrecognition results. The latest work proposes a GER benchmark with HyPoradise\ndataset to learn the mapping from ASR N-best hypotheses to ground-truth\ntranscription by efficient LLM finetuning, which shows great effectiveness but\nlacks specificity on noise-robust ASR. In this work, we extend the benchmark to\nnoisy conditions and investigate if we can teach LLMs to perform denoising for\nGER just like what robust ASR do}, where one solution is introducing noise\ninformation as a conditioner into LLM. However, directly incorporating noise\nembeddings from audio encoder could harm the LLM tuning due to cross-modality\ngap. To this end, we propose to extract a language-space noise embedding from\nthe N-best list to represent the noise conditions of source speech, which can\npromote the denoising process in GER. Furthermore, in order to enhance its\nrepresentation ability of audio noise, we design a knowledge distillation (KD)\napproach via mutual information estimation to distill the real noise\ninformation in audio embeddings to our language embedding. Experiments on\nvarious latest LLMs demonstrate our approach achieves a new breakthrough with\nup to 53.9% correction improvement in terms of word error rate while with\nlimited training data. Analysis shows that our language-space noise embedding\ncan well represent the noise conditions of source speech, under which\noff-the-shelf LLMs show strong ability of language-space denoising.\n","authors":["Yuchen Hu","Chen Chen","Chao-Han Huck Yang","Ruizhe Li","Chao Zhang","Pin-Yu Chen","EnSiong Chng"],"pdf_url":"https://arxiv.org/pdf/2401.10446v1.pdf","comment":"Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be\n open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license"},{"id":"http://arxiv.org/abs/2401.10440v1","updated":"2024-01-19T01:07:50Z","published":"2024-01-19T01:07:50Z","title":"Breaking the Curse of Multilinguality with Cross-lingual Expert Language\n Models","summary":" Despite their popularity in non-English NLP, multilingual language models\noften underperform monolingual ones due to inter-language competition for model\nparameters. We propose Cross-lingual Expert Language Models (X-ELM), which\nmitigate this competition by independently training language models on subsets\nof the multilingual corpus. 
This process specializes X-ELMs to different\nlanguages while remaining effective as a multilingual ensemble. Our experiments\nshow that when given the same compute budget, X-ELM outperforms jointly trained\nmultilingual models across all considered languages and that these gains\ntransfer to downstream tasks. X-ELM provides additional benefits over\nperformance improvements: new experts can be iteratively added, adapting X-ELM\nto new languages without catastrophic forgetting. Furthermore, training is\nasynchronous, reducing the hardware requirements for multilingual training and\ndemocratizing multilingual modeling.\n","authors":["Terra Blevins","Tomasz Limisiewicz","Suchin Gururangan","Margaret Li","Hila Gonen","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2401.10440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04398v2","updated":"2024-01-19T01:05:05Z","published":"2024-01-09T07:46:26Z","title":"Chain-of-Table: Evolving Tables in the Reasoning Chain for Table\n Understanding","summary":" Table-based reasoning with large language models (LLMs) is a promising\ndirection to tackle many table understanding tasks, such as table-based\nquestion answering and fact verification. Compared with generic reasoning,\ntable-based reasoning requires the extraction of underlying semantics from both\nfree-form questions and semi-structured tabular data. Chain-of-Thought and its\nsimilar approaches incorporate the reasoning chain in the form of textual\ncontext, but it is still an open question how to effectively leverage tabular\ndata in the reasoning chain. We propose the Chain-of-Table framework, where\ntabular data is explicitly used in the reasoning chain as a proxy for\nintermediate thoughts. Specifically, we guide LLMs using in-context learning to\niteratively generate operations and update the table to represent a tabular\nreasoning chain. LLMs can therefore dynamically plan the next operation based\non the results of the previous ones. This continuous evolution of the table\nforms a chain, showing the reasoning process for a given tabular problem. The\nchain carries structured information of the intermediate results, enabling more\naccurate and reliable predictions. Chain-of-Table achieves new state-of-the-art\nperformance on WikiTQ, FeTaQA, and TabFact benchmarks across multiple LLM\nchoices.\n","authors":["Zilong Wang","Hao Zhang","Chun-Liang Li","Julian Martin Eisenschlos","Vincent Perot","Zifeng Wang","Lesly Miculicich","Yasuhisa Fujii","Jingbo Shang","Chen-Yu Lee","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2401.04398v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11052v1","updated":"2024-01-19T23:00:31Z","published":"2024-01-19T23:00:31Z","title":"Mining experimental data from Materials Science literature with Large\n Language Models","summary":" This study is dedicated to evaluating the capabilities of advanced large\nlanguage models (LLMs) such as GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo in the\nextraction of structured information from scientific documents within the field\nof materials science. We introduce a novel methodology for the comparative\nanalysis of intricate material expressions, emphasising the standardisation of\nchemical formulas to tackle the complexities inherent in materials science\ninformation assessment. 
To this end, we primarily focus on two critical tasks\nof information extraction: (i) a named entity recognition (NER) of studied\nmaterials and physical properties and (ii) a relation extraction (RE) between\nthese entities. The performance of LLMs in executing these tasks is benchmarked\nagainst traditional models based on the BERT architecture and rule-based\napproaches. For NER, LLMs fail to outperform the baseline with zero-shot\nprompting and exhibit only limited improvement with few-shot prompting.\nHowever, for RE, a GPT-3.5-Turbo fine-tuned with the appropriate strategy\noutperforms all models, including the baseline. Without any fine-tuning, GPT-4\nand GPT-4-Turbo display remarkable reasoning and relationship extraction\ncapabilities after being provided with merely a couple of examples, surpassing\nthe baseline. Overall, the results suggest that although LLMs demonstrate\nrelevant reasoning skills in connecting concepts, for tasks requiring\nextracting complex domain-specific entities like materials, specialised models\nare currently a better choice.\n","authors":["Luca Foppiano","Guillaume Lambard","Toshiyuki Amagasa","Masashi Ishii"],"pdf_url":"https://arxiv.org/pdf/2401.11052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11048v1","updated":"2024-01-19T22:24:39Z","published":"2024-01-19T22:24:39Z","title":"PubTator 3.0: an AI-powered Literature Resource for Unlocking Biomedical\n Knowledge","summary":" PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a\nbiomedical literature resource using state-of-the-art AI techniques to offer\nsemantic and relation searches for key concepts like proteins, genetic\nvariants, diseases, and chemicals. It currently provides over one billion\nentity and relation annotations across approximately 36 million PubMed\nabstracts and 6 million full-text articles from the PMC open access subset,\nupdated weekly. PubTator 3.0's online interface and API utilize these\nprecomputed entity relations and synonyms to provide advanced search\ncapabilities and enable large-scale analyses, streamlining many complex\ninformation needs. We showcase the retrieval quality of PubTator 3.0 using a\nseries of entity pair queries, demonstrating that PubTator 3.0 retrieves a\ngreater number of articles than either PubMed or Google Scholar, with higher\nprecision in the top 20 results. We further show that integrating ChatGPT\n(GPT-4) with PubTator APIs dramatically improves the factuality and\nverifiability of its responses. In summary, PubTator 3.0 offers a comprehensive\nset of features and tools that allow researchers to navigate the ever-expanding\nwealth of biomedical literature, expediting research and unlocking valuable\ninsights for scientific discovery.\n","authors":["Chih-Hsuan Wei","Alexis Allot","Po-Ting Lai","Robert Leaman","Shubo Tian","Ling Luo","Qiao Jin","Zhizheng Wang","Qingyu Chen","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.11048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11033v1","updated":"2024-01-19T21:21:02Z","published":"2024-01-19T21:21:02Z","title":"FAIR Enough: How Can We Develop and Assess a FAIR-Compliant Dataset for\n Large Language Models' Training?","summary":" Advancements in Large Language Models (LLMs) highlight the need for ethical\npractices and data integrity. We introduce a framework that embeds FAIR\n(Findable, Accessible, Interoperable, Reusable) data principles into LLM\ntraining. This approach marks a shift towards practices compliant with FAIR\nstandards. 
Our framework presents guidelines for integrating FAIR data\nprinciples into LLM training. This initiative includes a checklist for\nresearchers and developers. We also demonstrate its practical application\nthrough a case study focused on bias identification and mitigation in our\nFAIR-compliant dataset. This work is a significant contribution to AI ethics\nand data science, advocating for balanced and ethical training methods in LLMs.\n","authors":["Shaina Raza","Shardul Ghuge","Chen Ding","Deval Pandya"],"pdf_url":"https://arxiv.org/pdf/2401.11033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11021v1","updated":"2024-01-19T20:40:23Z","published":"2024-01-19T20:40:23Z","title":"Analysis and Detection of Multilingual Hate Speech Using Transformer\n Based Deep Learning","summary":" Hate speech is harmful content that directly attacks or promotes hatred\nagainst members of groups or individuals based on actual or perceived aspects\nof identity, such as racism, religion, or sexual orientation. This can affect\nsocial life on social media platforms as hateful content shared through social\nmedia can harm both individuals and communities. As the prevalence of hate\nspeech increases online, the demand for automated detection as an NLP task is\nincreasing. In this work, the proposed method is using transformer-based model\nto detect hate speech in social media, like twitter, Facebook, WhatsApp,\nInstagram, etc. The proposed model is independent of languages and has been\ntested on Italian, English, German, Bengali. The Gold standard datasets were\ncollected from renowned researcher Zeerak Talat, Sara Tonelli, Melanie Siegel,\nand Rezaul Karim. The success rate of the proposed model for hate speech\ndetection is higher than the existing baseline and state-of-the-art models with\naccuracy in Bengali dataset is 89%, in English: 91%, in German dataset 91% and\nin Italian dataset it is 77%. The proposed algorithm shows substantial\nimprovement to the benchmark method.\n","authors":["Arijit Das","Somashree Nandy","Rupam Saha","Srijan Das","Diganta Saha"],"pdf_url":"https://arxiv.org/pdf/2401.11021v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.10995v1","updated":"2024-01-19T19:23:37Z","published":"2024-01-19T19:23:37Z","title":"The Radiation Oncology NLP Database","summary":" We present the Radiation Oncology NLP Database (ROND), the first dedicated\nNatural Language Processing (NLP) dataset for radiation oncology, an important\nmedical specialty that has received limited attention from the NLP community in\nthe past. With the advent of Artificial General Intelligence (AGI), there is an\nincreasing need for specialized datasets and benchmarks to facilitate research\nand development. ROND is specifically designed to address this gap in the\ndomain of radiation oncology, a field that offers many opportunities for NLP\nexploration. It encompasses various NLP tasks including Logic Reasoning, Text\nClassification, Named Entity Recognition (NER), Question Answering (QA), Text\nSummarization, and Patient-Clinician Conversations, each with a distinct focus\non radiation oncology concepts and application cases. In addition, we have\ndeveloped an instruction-tuning dataset consisting of over 20k instruction\npairs (based on ROND) and trained a large language model, CancerChat. This\nserves to demonstrate the potential of instruction-tuning large language models\nwithin a highly-specialized medical domain. 
The evaluation results in this\nstudy could serve as baseline results for future research. ROND aims to\nstimulate advancements in radiation oncology and clinical NLP by offering a\nplatform for testing and improving algorithms and models in a domain-specific\ncontext. The ROND dataset is a joint effort of multiple U.S. health\ninstitutions. The data is available at\nhttps://github.com/zl-liu/Radiation-Oncology-NLP-Database.\n","authors":["Zhengliang Liu","Jason Holmes","Wenxiong Liao","Chenbin Liu","Lian Zhang","Hongying Feng","Peilong Wang","Muhammad Ali Elahi","Hongmin Cai","Lichao Sun","Quanzheng Li","Xiang Li","Tianming Liu","Jiajian Shen","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10995v1.pdf","comment":"10 pages, 7 figures, 6 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.10891v1","updated":"2024-01-19T18:59:52Z","published":"2024-01-19T18:59:52Z","title":"Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data","summary":" This work presents Depth Anything, a highly practical solution for robust\nmonocular depth estimation. Without pursuing novel technical modules, we aim to\nbuild a simple yet powerful foundation model dealing with any images under any\ncircumstances. To this end, we scale up the dataset by designing a data engine\nto collect and automatically annotate large-scale unlabeled data (~62M), which\nsignificantly enlarges the data coverage and thus is able to reduce the\ngeneralization error. We investigate two simple yet effective strategies that\nmake data scaling-up promising. First, a more challenging optimization target\nis created by leveraging data augmentation tools. It compels the model to\nactively seek extra visual knowledge and acquire robust representations.\nSecond, an auxiliary supervision is developed to enforce the model to inherit\nrich semantic priors from pre-trained encoders. We evaluate its zero-shot\ncapabilities extensively, including six public datasets and randomly captured\nphotos. It demonstrates impressive generalization ability. Further, through\nfine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs\nare set. Our better depth model also results in a better depth-conditioned\nControlNet. Our models are released at\nhttps://github.com/LiheYoung/Depth-Anything.\n","authors":["Lihe Yang","Bingyi Kang","Zilong Huang","Xiaogang Xu","Jiashi Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10891v1.pdf","comment":"Project page: https://depth-anything.github.io"},{"id":"http://arxiv.org/abs/2401.10890v1","updated":"2024-01-19T18:59:37Z","published":"2024-01-19T18:59:37Z","title":"Event detection from novel data sources: Leveraging satellite imagery\n alongside GPS traces","summary":" Rapid identification and response to breaking events, particularly those that\npose a threat to human life such as natural disasters or conflicts, is of\nparamount importance. The prevalence of mobile devices and the ubiquity of\nnetwork connectivity has generated a massive amount of temporally- and\nspatially-stamped data. Numerous studies have used mobile data to derive\nindividual human mobility patterns for various applications. Similarly, the\nincreasing number of orbital satellites has made it easier to gather\nhigh-resolution images capturing a snapshot of a geographical area in sub-daily\ntemporal frequency. 
We propose a novel data fusion methodology integrating\nsatellite imagery with privacy-enhanced mobile data to augment the event\ninference task, whether in real-time or historical. In the absence of boots on\nthe ground, mobile data is able to give an approximation of human mobility,\nproximity to one another, and the built environment. On the other hand,\nsatellite imagery can provide visual information on physical changes to the\nbuilt and natural environment. The expected use cases for our methodology\ninclude small-scale disaster detection (i.e., tornadoes, wildfires, and floods)\nin rural regions, search and rescue operation augmentation for lost hikers in\nremote wilderness areas, and identification of active conflict areas and\npopulation displacement in war-torn states. Our implementation is open-source\non GitHub: https://github.com/ekinugurel/SatMobFusion.\n","authors":["Ekin Ugurel","Steffen Coenen","Minda Zhou Chen","Cynthia Chen"],"pdf_url":"https://arxiv.org/pdf/2401.10890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10889v1","updated":"2024-01-19T18:59:11Z","published":"2024-01-19T18:59:11Z","title":"Synthesizing Moving People with 3D Control","summary":" In this paper, we present a diffusion model-based framework for animating\npeople from a single image for a given target 3D motion sequence. Our approach\nhas two core components: a) learning priors about invisible parts of the human\nbody and clothing, and b) rendering novel body poses with proper clothing and\ntexture. For the first part, we learn an in-filling diffusion model to\nhallucinate unseen parts of a person given a single image. We train this model\non texture map space, which makes it more sample-efficient since it is\ninvariant to pose and viewpoint. Second, we develop a diffusion-based rendering\npipeline, which is controlled by 3D human poses. This produces realistic\nrenderings of novel poses of the person, including clothing, hair, and\nplausible in-filling of unseen regions. This disentangled approach allows our\nmethod to generate a sequence of images that are faithful to the target motion\nin the 3D pose and, to the input image in terms of visual similarity. In\naddition to that, the 3D control allows various synthetic camera trajectories\nto render a person. Our experiments show that our method is resilient in\ngenerating prolonged motions and varied challenging and complex poses compared\nto prior methods. Please check our website for more details:\nhttps://boyiliee.github.io/3DHM.github.io/.\n","authors":["Boyi Li","Jathushan Rajasegaran","Yossi Gandelsman","Alexei A. Efros","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2401.10889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10886v1","updated":"2024-01-19T18:57:46Z","published":"2024-01-19T18:57:46Z","title":"SCENES: Subpixel Correspondence Estimation With Epipolar Supervision","summary":" Extracting point correspondences from two or more views of a scene is a\nfundamental computer vision problem with particular importance for relative\ncamera pose estimation and structure-from-motion. Existing local feature\nmatching approaches, trained with correspondence supervision on large-scale\ndatasets, obtain highly-accurate matches on the test sets. However, they do not\ngeneralise well to new datasets with different characteristics to those they\nwere trained on, unlike classic feature extractors. 
Instead, they require\nfinetuning, which assumes that ground-truth correspondences or ground-truth\ncamera poses and 3D structure are available. We relax this assumption by\nremoving the requirement of 3D structure, e.g., depth maps or point clouds, and\nonly require camera pose information, which can be obtained from odometry. We\ndo so by replacing correspondence losses with epipolar losses, which encourage\nputative matches to lie on the associated epipolar line. While weaker than\ncorrespondence supervision, we observe that this cue is sufficient for\nfinetuning existing models on new data. We then further relax the assumption of\nknown camera poses by using pose estimates in a novel bootstrapping approach.\nWe evaluate on highly challenging datasets, including an indoor drone dataset\nand an outdoor smartphone camera dataset, and obtain state-of-the-art results\nwithout strong supervision.\n","authors":["Dominik A. Kloepfer","João F. Henriques","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2401.10886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20685v2","updated":"2024-01-19T18:53:13Z","published":"2023-10-31T17:49:48Z","title":"NeRF Revisited: Fixing Quadrature Instability in Volume Rendering","summary":" Neural radiance fields (NeRF) rely on volume rendering to synthesize novel\nviews. Volume rendering requires evaluating an integral along each ray, which\nis numerically approximated with a finite sum that corresponds to the exact\nintegral along the ray under piecewise constant volume density. As a\nconsequence, the rendered result is unstable w.r.t. the choice of samples along\nthe ray, a phenomenon that we dub quadrature instability. We propose a\nmathematically principled solution by reformulating the sample-based rendering\nequation so that it corresponds to the exact integral under piecewise linear\nvolume density. This simultaneously resolves multiple issues: conflicts between\nsamples along different rays, imprecise hierarchical sampling, and\nnon-differentiability of quantiles of ray termination distances w.r.t. model\nparameters. We demonstrate several benefits over the classical sample-based\nrendering equation, such as sharper textures, better geometric reconstruction,\nand stronger depth supervision. Our proposed formulation can be also be used as\na drop-in replacement to the volume rendering equation of existing NeRF-based\nmethods. Our project page can be found at pl-nerf.github.io.\n","authors":["Mikaela Angelina Uy","Kiyohiro Nakayama","Guandao Yang","Rahul Krishna Thomas","Leonidas Guibas","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2310.20685v2.pdf","comment":"Neurips 2023"},{"id":"http://arxiv.org/abs/2401.10877v1","updated":"2024-01-19T18:41:53Z","published":"2024-01-19T18:41:53Z","title":"The Cadaver in the Machine: The Social Practices of Measurement and\n Validation in Motion Capture Technology","summary":" Motion capture systems, used across various domains, make body\nrepresentations concrete through technical processes. We argue that the\nmeasurement of bodies and the validation of measurements for motion capture\nsystems can be understood as social practices. By analyzing the findings of a\nsystematic literature review (N=278) through the lens of social practice\ntheory, we show how these practices, and their varying attention to errors,\nbecome ingrained in motion capture design and innovation over time. Moreover,\nwe show how contemporary motion capture systems perpetuate assumptions about\nhuman bodies and their movements. 
We suggest that social practices of\nmeasurement and validation are ubiquitous in the development of data- and\nsensor-driven systems more broadly, and provide this work as a basis for\ninvestigating hidden design assumptions and their potential negative\nconsequences in human-computer interaction.\n","authors":["Emma Harvey","Hauke Sandhaus","Abigail Z. Jacobs","Emanuel Moss","Mona Sloane"],"pdf_url":"https://arxiv.org/pdf/2401.10877v1.pdf","comment":"34 pages, 9 figures. To appear in the 2024 ACM CHI Conference on\n Human Factors in Computing Systems (CHI '24)"},{"id":"http://arxiv.org/abs/2306.08251v2","updated":"2024-01-19T18:35:54Z","published":"2023-06-14T05:34:02Z","title":"GBSD: Generative Bokeh with Stage Diffusion","summary":" The bokeh effect is an artistic technique that blurs out-of-focus areas in a\nphotograph and has gained interest due to recent developments in text-to-image\nsynthesis and the ubiquity of smart-phone cameras and photo-sharing apps. Prior\nwork on rendering bokeh effects have focused on post hoc image manipulation to\nproduce similar blurring effects in existing photographs using classical\ncomputer graphics or neural rendering techniques, but have either depth\ndiscontinuity artifacts or are restricted to reproducing bokeh effects that are\npresent in the training data. More recent diffusion based models can synthesize\nimages with an artistic style, but either require the generation of\nhigh-dimensional masks, expensive fine-tuning, or affect global image\ncharacteristics. In this paper, we present GBSD, the first generative\ntext-to-image model that synthesizes photorealistic images with a bokeh style.\nMotivated by how image synthesis occurs progressively in diffusion models, our\napproach combines latent diffusion models with a 2-stage conditioning algorithm\nto render bokeh effects on semantically defined objects. Since we can focus the\neffect on objects, this semantic bokeh effect is more versatile than classical\nrendering techniques. We evaluate GBSD both quantitatively and qualitatively\nand demonstrate its ability to be applied in both text-to-image and\nimage-to-image settings.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2306.08251v2.pdf","comment":"Short Version is accepted by International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2303.05015v2","updated":"2024-01-19T18:23:19Z","published":"2023-03-09T03:33:56Z","title":"Smooth and Stepwise Self-Distillation for Object Detection","summary":" Distilling the structured information captured in feature maps has\ncontributed to improved results for object detection tasks, but requires\ncareful selection of baseline architectures and substantial pre-training.\nSelf-distillation addresses these limitations and has recently achieved\nstate-of-the-art performance for object detection despite making several\nsimplifying architectural assumptions. Building on this work, we propose Smooth\nand Stepwise Self-Distillation (SSSD) for object detection. Our SSSD\narchitecture forms an implicit teacher from object labels and a feature pyramid\nnetwork backbone to distill label-annotated feature maps using Jensen-Shannon\ndistance, which is smoother than distillation losses used in prior work. We\nadditionally add a distillation coefficient that is adaptively configured based\non the learning rate. 
We extensively benchmark SSSD against a baseline and two\nstate-of-the-art object detector architectures on the COCO dataset by varying\nthe coefficients and backbone and detector networks. We demonstrate that SSSD\nachieves higher average precision in most experimental settings, is robust to a\nwide range of coefficients, and benefits from our stepwise distillation\nprocedure.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2303.05015v2.pdf","comment":"Accepted by International Conference on Image Processing (ICIP) 2023"},{"id":"http://arxiv.org/abs/2401.10857v1","updated":"2024-01-19T18:00:52Z","published":"2024-01-19T18:00:52Z","title":"Motion Consistency Loss for Monocular Visual Odometry with\n Attention-Based Deep Learning","summary":" Deep learning algorithms have driven expressive progress in many complex\ntasks. The loss function is a core component of deep learning techniques,\nguiding the learning process of neural networks. This paper contributes by\nintroducing a consistency loss for visual odometry with deep learning-based\napproaches. The motion consistency loss explores repeated motions that appear\nin consecutive overlapped video clips. Experimental results show that our\napproach increased the performance of a model on the KITTI odometry benchmark.\n","authors":["André O. Françani","Marcos R. O. A. Maximo"],"pdf_url":"https://arxiv.org/pdf/2401.10857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10848v1","updated":"2024-01-19T17:48:05Z","published":"2024-01-19T17:48:05Z","title":"Source-Free and Image-Only Unsupervised Domain Adaptation for Category\n Level Object Pose Estimation","summary":" We consider the problem of source-free unsupervised category-level pose\nestimation from only RGB images to a target domain without any access to source\ndomain data or 3D annotations during adaptation. Collecting and annotating\nreal-world 3D data and corresponding images is laborious, expensive, yet\nunavoidable process, since even 3D pose domain adaptation methods require 3D\ndata in the target domain. We introduce 3DUDA, a method capable of adapting to\na nuisance-ridden target domain without 3D or depth data. Our key insight stems\nfrom the observation that specific object subparts remain stable across\nout-of-domain (OOD) scenarios, enabling strategic utilization of these\ninvariant subcomponents for effective model updates. We represent object\ncategories as simple cuboid meshes, and harness a generative model of neural\nfeature activations modeled at each mesh vertex learnt using differential\nrendering. We focus on individual locally robust mesh vertex features and\niteratively update them based on their proximity to corresponding features in\nthe target domain even when the global pose is not correct. Our model is then\ntrained in an EM fashion, alternating between updating the vertex features and\nthe feature extractor. We show that our method simulates fine-tuning on a\nglobal pseudo-labeled dataset under mild assumptions, which converges to the\ntarget domain asymptotically. 
Through extensive empirical validation, including\na complex extreme UDA setup which combines real nuisances, synthetic noise, and\nocclusion, we demonstrate the potency of our simple approach in addressing the\ndomain shift challenge and significantly improving pose estimation accuracy.\n","authors":["Prakhar Kaushik","Aayush Mishra","Adam Kortylewski","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2401.10848v1.pdf","comment":"36 pages, 9 figures, 50 tables; ICLR 2024 (Poster)"},{"id":"http://arxiv.org/abs/2401.10831v1","updated":"2024-01-19T17:27:21Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanisms are universal in video transformers. Finally, we demonstrate\nthat VTCD can be used to improve model performance for fine-grained tasks.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10822v1","updated":"2024-01-19T17:16:16Z","published":"2024-01-19T17:16:16Z","title":"ActAnywhere: Subject-Aware Video Background Generation","summary":" Generating video background that tailors to foreground subject motion is an\nimportant problem for the movie industry and visual effects community. This\ntask involves synthesizing background that aligns with the motion and\nappearance of the foreground subject, while also complying with the artist's\ncreative intention. We introduce ActAnywhere, a generative model that automates\nthis process which traditionally requires tedious manual efforts. Our model\nleverages the power of large-scale video diffusion models, and is specifically\ntailored for this task. ActAnywhere takes a sequence of foreground subject\nsegmentation as input and an image that describes the desired scene as\ncondition, to produce a coherent video with realistic foreground-background\ninteractions while adhering to the condition frame. We train our model on a\nlarge-scale dataset of human-scene interaction videos. Extensive evaluations\ndemonstrate the superior performance of our model, significantly outperforming\nbaselines. Moreover, we show that ActAnywhere generalizes to diverse\nout-of-distribution samples, including non-human subjects. 
Please visit our\nproject webpage at https://actanywhere.github.io.\n","authors":["Boxiao Pan","Zhan Xu","Chun-Hao Paul Huang","Krishna Kumar Singh","Yang Zhou","Leonidas J. Guibas","Jimei Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10815v1","updated":"2024-01-19T17:02:17Z","published":"2024-01-19T17:02:17Z","title":"RAD-DINO: Exploring Scalable Medical Image Encoders Beyond Text\n Supervision","summary":" Language-supervised pre-training has proven to be a valuable method for\nextracting semantically meaningful features from images, serving as a\nfoundational element in multimodal systems within the computer vision and\nmedical imaging domains. However, resulting features are limited by the\ninformation contained within the text. This is particularly problematic in\nmedical imaging, where radiologists' written findings focus on specific\nobservations; a challenge compounded by the scarcity of paired imaging-text\ndata due to concerns over leakage of personal health information. In this work,\nwe fundamentally challenge the prevailing reliance on language supervision for\nlearning general purpose biomedical imaging encoders. We introduce RAD-DINO, a\nbiomedical image encoder pre-trained solely on unimodal biomedical imaging data\nthat obtains similar or greater performance than state-of-the-art biomedical\nlanguage supervised models on a diverse range of benchmarks. Specifically, the\nquality of learned representations is evaluated on standard imaging tasks\n(classification and semantic segmentation), and a vision-language alignment\ntask (text report generation from images). To further demonstrate the drawback\nof language supervision, we show that features from RAD-DINO correlate with\nother medical records (e.g., sex or age) better than language-supervised\nmodels, which are generally not mentioned in radiology reports. Finally, we\nconduct a series of ablations determining the factors in RAD-DINO's\nperformance; notably, we observe that RAD-DINO's downstream performance scales\nwell with the quantity and diversity of training data, demonstrating that\nimage-only supervision is a scalable approach for training a foundational\nbiomedical image encoder.\n","authors":["Fernando Pérez-García","Harshita Sharma","Sam Bond-Taylor","Kenza Bouzid","Valentina Salvatelli","Maximilian Ilse","Shruthi Bannur","Daniel C. Castro","Anton Schwaighofer","Matthew P. Lungren","Maria Wetscherek","Noel Codella","Stephanie L. Hyland","Javier Alvarez-Valle","Ozan Oktay"],"pdf_url":"https://arxiv.org/pdf/2401.10815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10805v1","updated":"2024-01-19T16:48:49Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" In this work, we introduce the novel concept of visually Connecting Actions\nand Their Effects (CATE) in video understanding. CATE can have applications in\nareas like task planning and learning from demonstration. We propose different\nCATE-based task formulations, such as action selection and action\nspecification, where video understanding models connect actions and effects at\nsemantic and fine-grained levels. We observe that different formulations\nproduce representations capturing intuitive action properties. We also design\nvarious baseline models for action selection and action specification. Despite\nthe intuitive nature of the task, we observe that models struggle, and humans\noutperform them by a large margin. 
The study aims to establish a foundation for\nfuture efforts, showcasing the flexibility and versatility of connecting\nactions and effects in video understanding, with the hope of inspiring advanced\nformulations and models.\n","authors":["Eric Peh","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10790v1","updated":"2024-01-19T16:21:55Z","published":"2024-01-19T16:21:55Z","title":"Measuring the Impact of Scene Level Objects on Object Detection: Towards\n Quantitative Explanations of Detection Decisions","summary":" Although accuracy and other common metrics can provide a useful window into\nthe performance of an object detection model, they lack a deeper view of the\nmodel's decision process. Regardless of the quality of the training data and\nprocess, the features that an object detection model learns cannot be\nguaranteed. A model may learn a relationship between certain background\ncontext, i.e., scene level objects, and the presence of the labeled classes.\nFurthermore, standard performance verification and metrics would not identify\nthis phenomenon. This paper presents a new black box explainability method for\nadditional verification of object detection models by finding the impact of\nscene level objects on the identification of the objects within the image. By\ncomparing the accuracies of a model on test data with and without certain scene\nlevel objects, the contributions of these objects to the model's performance\nbecomes clearer. The experiment presented here will assess the impact of\nbuildings and people in image context on the detection of emergency road\nvehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the\npresence of a scene level object will indicate the model's reliance on that\nobject to make its detections. The results of this research lead to providing a\nquantitative explanation of the object detection model's decision process,\nenabling a deeper understanding of the model's performance.\n","authors":["Lynn Vonder Haar","Timothy Elvira","Luke Newcomb","Omar Ochoa"],"pdf_url":"https://arxiv.org/pdf/2401.10790v1.pdf","comment":"9 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.10786v1","updated":"2024-01-19T16:15:37Z","published":"2024-01-19T16:15:37Z","title":"Sat2Scene: 3D Urban Scene Generation from Satellite Images with\n Diffusion","summary":" Directly generating scenes from satellite imagery offers exciting\npossibilities for integration into applications like games and map services.\nHowever, challenges arise from significant view changes and scene scale.\nPrevious efforts mainly focused on image or video generation, lacking\nexploration into the adaptability of scene generation for arbitrary views.\nExisting 3D generation works either operate at the object level or are\ndifficult to utilize the geometry obtained from satellite imagery. To overcome\nthese limitations, we propose a novel architecture for direct 3D scene\ngeneration by introducing diffusion models into 3D sparse representations and\ncombining them with neural rendering techniques. Specifically, our approach\ngenerates texture colors at the point level for a given geometry using a 3D\ndiffusion model first, which is then transformed into a scene representation in\na feed-forward manner. The representation can be utilized to render arbitrary\nviews which would excel in both single-frame quality and inter-frame\nconsistency. 
Experiments in two city-scale datasets show that our model\ndemonstrates proficiency in generating photo-realistic street-view image\nsequences and cross-view urban scenes from satellite imagery.\n","authors":["Zuoyue Li","Zhenqiang Li","Zhaopeng Cui","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2401.10786v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.09495v2","updated":"2024-01-19T16:11:28Z","published":"2024-01-17T01:33:40Z","title":"IPR-NeRF: Ownership Verification meets Neural Radiance Field","summary":" Neural Radiance Field (NeRF) models have gained significant attention in the\ncomputer vision community in the recent past with state-of-the-art visual\nquality and produced impressive demonstrations. Since then, technopreneurs have\nsought to leverage NeRF models into a profitable business. Therefore, NeRF\nmodels make it worth the risk of plagiarizers illegally copying,\nre-distributing, or misusing those models. This paper proposes a comprehensive\nintellectual property (IP) protection framework for the NeRF model in both\nblack-box and white-box settings, namely IPR-NeRF. In the black-box setting, a\ndiffusion-based solution is introduced to embed and extract the watermark via a\ntwo-stage optimization process. In the white-box setting, a designated digital\nsignature is embedded into the weights of the NeRF model by adopting the sign\nloss objective. Our extensive experiments demonstrate that not only does our\napproach maintain the fidelity (\\ie, the rendering quality) of IPR-NeRF models,\nbut it is also robust against both ambiguity and removal attacks compared to\nprior arts.\n","authors":["Win Kent Ong","Kam Woh Ng","Chee Seng Chan","Yi Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2401.09495v2.pdf","comment":"Error on the paper"},{"id":"http://arxiv.org/abs/2401.10777v1","updated":"2024-01-19T15:51:34Z","published":"2024-01-19T15:51:34Z","title":"Determination of efficiency indicators of the stand for intelligent\n control of manual operations in industrial production","summary":" Systems of intelligent control of manual operations in industrial production\nare being implemented in many industries nowadays. Such systems use\nhigh-resolution cameras and computer vision algorithms to automatically track\nthe operator's manipulations and prevent technological errors in the assembly\nprocess. At the same time compliance with safety regulations in the workspace\nis monitored. As a result, the defect rate of manufactured products and the\nnumber of accidents during the manual assembly of any device are decreased.\nBefore implementing an intelligent control system into a real production it is\nnecessary to calculate its efficiency. In order to do it experiments on the\nstand for manual operations control systems were carried out. This paper\nproposes the methodology for calculating the efficiency indicators. This\nmathematical approach is based on the IoU calculation of real- and\npredicted-time intervals between assembly stages. 
The results show high\nprecision in tracking the validity of manual assembly and do not depend on the\nduration of the assembly process.\n","authors":["Anton Sergeev","Victor Minchenkov","Aleksei Soldatov"],"pdf_url":"https://arxiv.org/pdf/2401.10777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01984v2","updated":"2024-01-19T15:51:32Z","published":"2024-01-03T21:24:44Z","title":"AUPIMO: Redefining Visual Anomaly Detection Benchmarks with High Speed\n and Low Tolerance","summary":" Recent advances in visual anomaly detection research have seen AUROC and\nAUPRO scores on public benchmark datasets such as MVTec and VisA converge\ntowards perfect recall, giving the impression that these benchmarks are\nnear-solved. However, high AUROC and AUPRO scores do not always reflect\nqualitative performance, which limits the validity of these metrics in\nreal-world applications. We argue that the artificial ceiling imposed by the\nlack of an adequate evaluation metric restrains progression of the field, and\nit is crucial that we revisit the evaluation metrics used to rate our\nalgorithms. In response, we introduce Per-IMage Overlap (PIMO), a novel metric\nthat addresses the shortcomings of AUROC and AUPRO. PIMO retains the\nrecall-based nature of the existing metrics but introduces two distinctions:\nthe assignment of curves (and respective area under the curve) is per-image,\nand its X-axis relies solely on normal images. Measuring recall per image\nsimplifies instance score indexing and is more robust to noisy annotations. As\nwe show, it also accelerates computation and enables the usage of statistical\ntests to compare models. By imposing low tolerance for false positives on\nnormal images, PIMO provides an enhanced model validation procedure and\nhighlights performance variations across datasets. Our experiments demonstrate\nthat PIMO offers practical advantages and nuanced performance insights that\nredefine anomaly detection benchmarks -- notably challenging the perception\nthat MVTec AD and VisA datasets have been solved by contemporary models.\nAvailable on GitHub: https://github.com/jpcbertoldo/aupimo.\n","authors":["Joao P. C. Bertoldo","Dick Ameln","Ashwin Vaidya","Samet Akçay"],"pdf_url":"https://arxiv.org/pdf/2401.01984v2.pdf","comment":"This research has been conducted during Google Summer of Code 2023\n (GSoC 2023) at OpenVINO (Intel). GSoC 2023 page:\n https://summerofcode.withgoogle.com/archive/2023/projects/SPMopugd"},{"id":"http://arxiv.org/abs/2401.10761v1","updated":"2024-01-19T15:33:46Z","published":"2024-01-19T15:33:46Z","title":"NN-VVC: Versatile Video Coding boosted by self-supervisedly learned\n image coding for machines","summary":" The recent progress in artificial intelligence has led to an ever-increasing\nusage of images and videos by machine analysis algorithms, mainly neural\nnetworks. Nonetheless, compression, storage and transmission of media have\ntraditionally been designed considering human beings as the viewers of the\ncontent. Recent research on image and video coding for machine analysis has\nprogressed mainly in two almost orthogonal directions. The first is represented\nby end-to-end (E2E) learned codecs which, while offering high performance on\nimage coding, are not yet on par with state-of-the-art conventional video\ncodecs and lack interoperability. 
The second direction considers using the\nVersatile Video Coding (VVC) standard or any other conventional video codec\n(CVC) together with pre- and post-processing operations targeting machine\nanalysis. While the CVC-based methods benefit from interoperability and broad\nhardware and software support, the machine task performance is often lower than\nthe desired level, particularly in low bitrates. This paper proposes a hybrid\ncodec for machines called NN-VVC, which combines the advantages of an\nE2E-learned image codec and a CVC to achieve high performance in both image and\nvideo coding for machines. Our experiments show that the proposed system\nachieved up to -43.20% and -26.8% Bj{\\o}ntegaard Delta rate reduction over VVC\nfor image and video data, respectively, when evaluated on multiple different\ndatasets and machine vision tasks. To the best of our knowledge, this is the\nfirst research paper showing a hybrid video codec that outperforms VVC on\nmultiple datasets and multiple machine vision tasks.\n","authors":["Jukka I. Ahonen","Nam Le","Honglei Zhang","Antti Hallapuro","Francesco Cricri","Hamed Rezazadegan Tavakoli","Miska M. Hannuksela","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2401.10761v1.pdf","comment":"ISM 2023 Best paper award winner version"},{"id":"http://arxiv.org/abs/2212.08044v3","updated":"2024-01-19T15:29:34Z","published":"2022-12-15T18:52:03Z","title":"Benchmarking Robustness of Multimodal Image-Text Models under\n Distribution Shift","summary":" Multimodal image-text models have shown remarkable performance in the past\nfew years. However, evaluating robustness against distribution shifts is\ncrucial before adopting them in real-world applications. In this work, we\ninvestigate the robustness of 12 popular open-sourced image-text models under\ncommon perturbations on five tasks (image-text retrieval, visual reasoning,\nvisual entailment, image captioning, and text-to-image generation). In\nparticular, we propose several new multimodal robustness benchmarks by applying\n17 image perturbation and 16 text perturbation techniques on top of existing\ndatasets. We observe that multimodal models are not robust to image and text\nperturbations, especially to image perturbations. Among the tested perturbation\nmethods, character-level perturbations constitute the most severe distribution\nshift for text, and zoom blur is the most severe shift for image data. We also\nintroduce two new robustness metrics (\\textbf{MMI} for MultiModal Impact score\nand \\textbf{MOR} for Missing Object Rate) for proper evaluations of multimodal\nmodels. We hope our extensive study sheds light on new directions for the\ndevelopment of robust multimodal models. More details can be found on the\nproject webpage: \\url{https://MMRobustness.github.io}.\n","authors":["Jielin Qiu","Yi Zhu","Xingjian Shi","Florian Wenzel","Zhiqiang Tang","Ding Zhao","Bo Li","Mu Li"],"pdf_url":"https://arxiv.org/pdf/2212.08044v3.pdf","comment":"Accepted by Journal of Data-centric Machine Learning Research (DMLR)\n 2024"},{"id":"http://arxiv.org/abs/2401.10752v1","updated":"2024-01-19T15:21:51Z","published":"2024-01-19T15:21:51Z","title":"HiCD: Change Detection in Quality-Varied Images via Hierarchical\n Correlation Distillation","summary":" Advanced change detection techniques primarily target image pairs of equal\nand high quality. However, variations in imaging conditions and platforms\nfrequently lead to image pairs with distinct qualities: one image being\nhigh-quality, while the other being low-quality. 
These disparities in image\nquality present significant challenges for understanding image pairs\nsemantically and extracting change features, ultimately resulting in a notable\ndecline in performance. To tackle this challenge, we introduce an innovative\ntraining strategy grounded in knowledge distillation. The core idea revolves\naround leveraging task knowledge acquired from high-quality image pairs to\nguide the model's learning process when dealing with image pairs that exhibit\ndifferences in quality. Additionally, we develop a hierarchical correlation\ndistillation approach (involving self-correlation, cross-correlation, and\nglobal correlation). This approach compels the student model to replicate the\ncorrelations inherent in the teacher model, rather than focusing solely on\nindividual features. This ensures effective knowledge transfer while\nmaintaining the student model's training flexibility.\n","authors":["Chao Pang","Xingxing Weng","Jiang Wu","Qiang Wang","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2401.10752v1.pdf","comment":"accepted by TGRS"},{"id":"http://arxiv.org/abs/2401.10741v1","updated":"2024-01-19T14:59:26Z","published":"2024-01-19T14:59:26Z","title":"Character Recognition in Byzantine Seals with Deep Neural Networks","summary":" Seals are small coin-shaped artifacts, mostly made of lead, held with strings\nto seal letters. This work presents the first attempt towards automatic reading\nof text on Byzantine seal images. Byzantine seals are generally decorated with\niconography on the obverse side and Greek text on the reverse side. Text may\ninclude the sender's name, position in the Byzantine aristocracy, and elements\nof prayers. Both text and iconography are precious literary sources that wait\nto be exploited electronically, so the development of computerized systems for\ninterpreting seal images is of paramount importance. This work's contribution\nis hence a deep, two-stage character reading pipeline for transcribing\nByzantine seal images. A first deep convolutional neural network (CNN) detects\ncharacters in the seal (character localization). A second convolutional network\nreads the localized characters (character classification). Finally, a\ndiplomatic transcription of the seal is provided by post-processing the two\nnetwork outputs. We provide an experimental evaluation of each CNN in isolation\nand both CNNs in combination. All performances are evaluated by\ncross-validation. Character localization achieves a mean average precision\n(mAP@0.5) greater than 0.9. Classification of characters cropped from ground\ntruth bounding boxes achieves Top-1 accuracy greater than 0.92. End-to-end\nevaluation shows the efficiency of the proposed approach when compared to the\nSoTA for similar tasks.\n","authors":["Théophile Rageau","Laurence Likforman-Sulem","Attilio Fiandrotti","Victoria Eyharabide","Béatrice Caseau","Jean-Claude Cheynet"],"pdf_url":"https://arxiv.org/pdf/2401.10741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10732v1","updated":"2024-01-19T14:49:56Z","published":"2024-01-19T14:49:56Z","title":"Bridging the gap between image coding for machines and humans","summary":" Image coding for machines (ICM) aims at reducing the bitrate required to\nrepresent an image while minimizing the drop in machine vision analysis\naccuracy. 
In many use cases, such as surveillance, it is also important that\nthe visual quality is not drastically deteriorated by the compression process.\nRecent works on using neural network (NN) based ICM codecs have shown\nsignificant coding gains against traditional methods; however, the decompressed\nimages, especially at low bitrates, often contain checkerboard artifacts. We\npropose an effective decoder finetuning scheme based on adversarial training to\nsignificantly enhance the visual quality of ICM codecs, while preserving the\nmachine analysis accuracy, without adding extra bitcost or parameters at the\ninference phase. The results show complete removal of the checkerboard\nartifacts at the negligible cost of -1.6% relative change in task performance\nscore. In the cases where some amount of artifacts is tolerable, such as when\nmachine consumption is the primary target, this technique can enhance both\npixel-fidelity and feature-fidelity scores without losing task performance.\n","authors":["Nam Le","Honglei Zhang","Francesco Cricri","Ramin G. Youvalari","Hamed Rezazadegan Tavakoli","Emre Aksu","Miska M. Hannuksela","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2401.10732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10731v1","updated":"2024-01-19T14:49:42Z","published":"2024-01-19T14:49:42Z","title":"Removal and Selection: Improving RGB-Infrared Object Detection via\n Coarse-to-Fine Fusion","summary":" Object detection in visible (RGB) and infrared (IR) images has been widely\napplied in recent years. Leveraging the complementary characteristics of RGB\nand IR images, the object detector provides reliable and robust object\nlocalization from day to night. Existing fusion strategies directly inject RGB\nand IR images into convolution neural networks, leading to inferior detection\nperformance. Since the RGB and IR features have modality-specific noise, these\nstrategies will worsen the fused features along with the propagation. Inspired\nby the mechanism of human brain processing multimodal information, this work\nintroduces a new coarse-to-fine perspective to purify and fuse two modality\nfeatures. Specifically, following this perspective, we design a Redundant\nSpectrum Removal module to coarsely remove interfering information within each\nmodality and a Dynamic Feature Selection module to finely select the desired\nfeatures for feature fusion. To verify the effectiveness of the coarse-to-fine\nfusion strategy, we construct a new object detector called Removal and\nSelection Detector (RSDet). Extensive experiments on three RGB-IR object\ndetection datasets verify the superior performance of our method.\n","authors":["Tianyi Zhao","Maoxun Yuan","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2401.10731v1.pdf","comment":"9pages, 7figures"},{"id":"http://arxiv.org/abs/2401.10727v1","updated":"2024-01-19T14:44:37Z","published":"2024-01-19T14:44:37Z","title":"Tool-LMM: A Large Multi-Modal Model for Tool Agent Learning","summary":" Recently, the astonishing performance of large language models (LLMs) in\nnatural language comprehension and generation tasks triggered lots of\nexploration of using them as central controllers to build agent systems.\nMultiple studies focus on bridging the LLMs to external tools to extend the\napplication scenarios. However, the current LLMs' perceiving tool-use ability\nis limited to a single text query, which may result in ambiguity in\nunderstanding the users' real intentions. 
LLMs are expected to eliminate that\nby perceiving the visual- or auditory-grounded instructions' information.\nTherefore, in this paper, we propose Tool-LMM, a system incorporating\nopen-source LLMs and multi-modal encoders so that the learnt LLMs can be\nconscious of multi-modal input instruction and then select the function-matched\ntool correctly. To facilitate the evaluation of the model's capability, we\ncollect a dataset featured by consisting of multi-modal input tools from\nHuggingFace. Another important feature of our dataset is that our dataset also\ncontains multiple potential choices for the same instruction due to the\nexistence of identical functions and synonymous functions, which provides more\npotential solutions for the same query. The experiments reveal that our LMM is\ncapable of recommending appropriate tools for multi-modal instructions. Codes\nand data are available at https://github.com/Tool-LMM/Tool-LMM.\n","authors":["Chenyu Wang","Weixin Luo","Qianyu Chen","Haonan Mai","Jindi Guo","Sixun Dong"," Xiaohua"," Xuan","Zhengxin Li","Lin Ma","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2401.10727v1.pdf","comment":"21 pages, 9 figures, 10 tables"},{"id":"http://arxiv.org/abs/2103.10702v4","updated":"2024-01-19T14:43:57Z","published":"2021-03-19T09:31:08Z","title":"ClawCraneNet: Leveraging Object-level Relation for Text-based Video\n Segmentation","summary":" Text-based video segmentation is a challenging task that segments out the\nnatural language referred objects in videos. It essentially requires semantic\ncomprehension and fine-grained video understanding. Existing methods introduce\nlanguage representation into segmentation models in a bottom-up manner, which\nmerely conducts vision-language interaction within local receptive fields of\nConvNets. We argue that such interaction is not fulfilled since the model can\nbarely construct region-level relationships given partial observations, which\nis contrary to the description logic of natural language/referring expressions.\nIn fact, people usually describe a target object using relations with other\nobjects, which may not be easily understood without seeing the whole video. To\naddress the issue, we introduce a novel top-down approach by imitating how we\nhuman segment an object with the language guidance. We first figure out all\ncandidate objects in videos and then choose the refereed one by parsing\nrelations among those high-level objects. Three kinds of object-level relations\nare investigated for precise relationship understanding, i.e., positional\nrelation, text-guided semantic relation, and temporal relation. Extensive\nexperiments on A2D Sentences and J-HMDB Sentences show our method outperforms\nstate-of-the-art methods by a large margin. 
Qualitative results also show our\nresults are more explainable.\n","authors":["Chen Liang","Yu Wu","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2103.10702v4.pdf","comment":"Extended version published in\n https://ieeexplore.ieee.org/abstract/document/10083244"},{"id":"http://arxiv.org/abs/2401.10712v1","updated":"2024-01-19T14:22:29Z","published":"2024-01-19T14:22:29Z","title":"Q&A Prompts: Discovering Rich Visual Clues through Mining\n Question-Answer Prompts for VQA requiring Diverse World Knowledge","summary":" With the breakthrough of multi-modal large language models, answering complex\nvisual questions that demand advanced reasoning abilities and world knowledge\nhas become a much more important testbed for developing AI models than ever.\nHowever, equipping AI models with robust cross-modality reasoning ability\nremains challenging since the cognition scheme of humans has not been\nunderstood systematically. In this paper, we believe that if we can collect\nvisual clues in the given image as much as possible, we will recognize the\nimage more accurately, understand the question better, recall relevant\nknowledge more easily, and finally reason out the answer. We discover these\nrich visual clues by mining question-answer pairs in images and sending them\ninto multi-modal large language models as prompts. We call the proposed method\nQ&A Prompts. Specifically, we first use the image-answer pairs and the\ncorresponding questions in the training set as inputs and outputs to train a\nvisual question generation model. Then, we use an image tagging model to\nidentify various instances and send packaged image-tag pairs into the visual\nquestion generation model to generate relevant questions with the extracted\nimage tags as answers. Finally, we encode these generated question-answer pairs\nas prompts with a visual-aware prompting module and send them into pre-trained\nmulti-modal large language models to reason out the final answers. Experimental\nresults show that, compared with state-of-the-art methods, our Q&A Prompts\nachieves substantial improvements on the challenging visual question answering\ndatasets requiring reasoning over diverse world knowledge, such as OK-VQA and\nA-OKVQA.\n","authors":["Haibi Wang","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10711v1","updated":"2024-01-19T14:21:46Z","published":"2024-01-19T14:21:46Z","title":"Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal\n Models for Video Question Answering","summary":" Video Question Answering (VideoQA) aims to answer natural language questions\nbased on the information observed in videos. Despite the recent success of\nLarge Multimodal Models (LMMs) in image-language understanding and reasoning,\nthey deal with VideoQA insufficiently by simply taking uniformly sampled frames\nas visual inputs, which ignores question-relevant visual clues. Moreover, there\nare no human annotations for question-critical timestamps in existing VideoQA\ndatasets. In light of this, we propose a novel weakly supervised framework to\nenforce the LMMs to reason out the answers with question-critical moments as\nvisual inputs. Specifically, we fuse the question and answer pairs as event\ndescriptions to find multiple keyframes as target moments, which will be\npseudo-labels. With these pseudo-labels as additionally weak supervision, we\ndevise a lightweight Gaussian-based Contrastive Grounding (GCG) module. 
GCG\nlearns multiple Gaussian functions to characterize the temporal structure of\nthe video, and sample question-critical frames as positive moments to be the\nvisual inputs of LMMs. Extensive experiments on several VideoQA benchmarks\nverify the effectiveness of our framework, and we achieve substantial\nimprovements compared to previous state-of-the-art methods.\n","authors":["Haibo Wang","Chenghang Lai","Yixuan Sun","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10709v1","updated":"2024-01-19T14:14:26Z","published":"2024-01-19T14:14:26Z","title":"Dense 3D Reconstruction Through Lidar: A Comparative Study on Ex-vivo\n Porcine Tissue","summary":" New sensing technologies and more advanced processing algorithms are\ntransforming computer-integrated surgery. While researchers are actively\ninvestigating depth sensing and 3D reconstruction for vision-based surgical\nassistance, it remains difficult to achieve real-time, accurate, and robust 3D\nrepresentations of the abdominal cavity for minimally invasive surgery. Thus,\nthis work uses quantitative testing on fresh ex-vivo porcine tissue to\nthoroughly characterize the quality with which a 3D laser-based time-of-flight\nsensor (lidar) can perform anatomical surface reconstruction. Ground-truth\nsurface shapes are captured with a commercial laser scanner, and the resulting\nsigned error fields are analyzed using rigorous statistical tools. When\ncompared to modern learning-based stereo matching from endoscopic images,\ntime-of-flight sensing demonstrates higher precision, lower processing delay,\nhigher frame rate, and superior robustness against sensor distance and poor\nillumination. Furthermore, we report on the potential negative effect of\nnear-infrared light penetration on the accuracy of lidar measurements across\ndifferent tissue samples, identifying a significant measured depth offset for\nmuscle in contrast to fat and liver. Our findings highlight the potential of\nlidar for intraoperative 3D perception and point toward new methods that\ncombine complementary time-of-flight and spectral imaging.\n","authors":["Guido Caccianiga","Julian Nubert","Marco Hutter","Katherine J. Kuchenbecker"],"pdf_url":"https://arxiv.org/pdf/2401.10709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11795v2","updated":"2024-01-19T14:08:38Z","published":"2022-10-21T08:18:49Z","title":"PoseScript: Linking 3D Human Poses and Natural Language","summary":" Natural language plays a critical role in many computer vision applications,\nsuch as image captioning, visual question answering, and cross-modal retrieval,\nto provide fine-grained semantic information. Unfortunately, while human pose\nis key to human understanding, current 3D human pose datasets lack detailed\nlanguage descriptions. To address this issue, we have introduced the PoseScript\ndataset. This dataset pairs more than six thousand 3D human poses from AMASS\nwith rich human-annotated descriptions of the body parts and their spatial\nrelationships. Additionally, to increase the size of the dataset to a scale\nthat is compatible with data-hungry learning algorithms, we have proposed an\nelaborate captioning process that generates automatic synthetic descriptions in\nnatural language from given 3D keypoints. This process extracts low-level pose\ninformation, known as \"posecodes\", using a set of simple but generic rules on\nthe 3D keypoints. 
These posecodes are then combined into higher level textual\ndescriptions using syntactic rules. With automatic annotations, the amount of\navailable data significantly scales up (100k), making it possible to\neffectively pretrain deep models for finetuning on human captions. To showcase\nthe potential of annotated poses, we present three multi-modal learning tasks\nthat utilize the PoseScript dataset. Firstly, we develop a pipeline that maps\n3D poses and textual descriptions into a joint embedding space, allowing for\ncross-modal retrieval of relevant poses from large-scale datasets. Secondly, we\nestablish a baseline for a text-conditioned model generating 3D poses. Thirdly,\nwe present a learned process for generating pose descriptions. These\napplications demonstrate the versatility and usefulness of annotated poses in\nvarious tasks and pave the way for future research in the field.\n","authors":["Ginger Delmas","Philippe Weinzaepfel","Thomas Lucas","Francesc Moreno-Noguer","Grégory Rogez"],"pdf_url":"https://arxiv.org/pdf/2210.11795v2.pdf","comment":"Extended version of the ECCV 2022 paper"},{"id":"http://arxiv.org/abs/2106.01061v2","updated":"2024-01-19T13:44:46Z","published":"2021-06-02T10:26:13Z","title":"Rethinking Cross-modal Interaction from a Top-down Perspective for\n Referring Video Object Segmentation","summary":" Referring video object segmentation (RVOS) aims to segment video objects with\nthe guidance of natural language reference. Previous methods typically tackle\nRVOS through directly grounding linguistic reference over the image lattice.\nSuch bottom-up strategy fails to explore object-level cues, easily leading to\ninferior results. In this work, we instead put forward a two-stage, top-down\nRVOS solution. First, an exhaustive set of object tracklets is constructed by\npropagating object masks detected from several sampled frames to the entire\nvideo. Second, a Transformer-based tracklet-language grounding module is\nproposed, which models instance-level visual relations and cross-modal\ninteractions simultaneously and efficiently. Our model ranks first place on\nCVPR2021 Referring Youtube-VOS challenge.\n","authors":["Chen Liang","Yu Wu","Tianfei Zhou","Wenguan Wang","Zongxin Yang","Yunchao Wei","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2106.01061v2.pdf","comment":"Champion solution in YouTube-VOS 2021 Track 3. Extended version\n published in https://ieeexplore.ieee.org/abstract/document/10083244"},{"id":"http://arxiv.org/abs/2301.13359v4","updated":"2024-01-19T13:25:03Z","published":"2023-01-31T01:24:45Z","title":"IM-IAD: Industrial Image Anomaly Detection Benchmark in Manufacturing","summary":" Image anomaly detection (IAD) is an emerging and vital computer vision task\nin industrial manufacturing (IM). Recently, many advanced algorithms have been\nreported, but their performance deviates considerably with various IM settings.\nWe realize that the lack of a uniform IM benchmark is hindering the development\nand usage of IAD methods in real-world applications. In addition, it is\ndifficult for researchers to analyze IAD algorithms without a uniform\nbenchmark. To solve this problem, we propose a uniform IM benchmark, for the\nfirst time, to assess how well these algorithms perform, which includes various\nlevels of supervision (unsupervised versus fully supervised), learning\nparadigms (few-shot, continual and noisy label), and efficiency (memory usage\nand inference speed). 
Then, we construct a comprehensive image anomaly\ndetection benchmark (IM-IAD), which includes 19 algorithms on seven major\ndatasets with a uniform setting. Extensive experiments (17,017 total) on IM-IAD\nprovide in-depth insights into IAD algorithm redesign or selection. Moreover,\nthe proposed IM-IAD benchmark challenges existing algorithms and suggests\nfuture research directions. To foster reproducibility and accessibility, the\nsource code of IM-IAD is uploaded on the website,\nhttps://github.com/M-3LAB/IM-IAD.\n","authors":["Guoyang Xie","Jinbao Wang","Jiaqi Liu","Jiayi Lyu","Yong Liu","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.13359v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13310v2","updated":"2024-01-19T13:03:04Z","published":"2023-05-22T17:59:43Z","title":"Matcher: Segment Anything with One Shot Using All-Purpose Feature\n Matching","summary":" Powered by large-scale pre-training, vision foundation models exhibit\nsignificant potential in open-world image understanding. However, unlike large\nlanguage models that excel at directly tackling various language tasks, vision\nfoundation models require a task-specific model structure followed by\nfine-tuning on specific tasks. In this work, we present Matcher, a novel\nperception paradigm that utilizes off-the-shelf vision foundation models to\naddress various perception tasks. Matcher can segment anything by using an\nin-context example without training. Additionally, we design three effective\ncomponents within the Matcher framework to collaborate with these foundation\nmodels and unleash their full potential in diverse perception tasks. Matcher\ndemonstrates impressive generalization performance across various segmentation\ntasks, all without training. For example, it achieves 52.7% mIoU on COCO-20$^i$\nwith one example, surpassing the state-of-the-art specialist model by 1.6%. In\naddition, Matcher achieves 33.0% mIoU on the proposed LVIS-92$^i$ for one-shot\nsemantic segmentation, outperforming the state-of-the-art generalist model by\n14.4%. Our visualization results further showcase the open-world generality and\nflexibility of Matcher when applied to images in the wild. Our code can be\nfound at https://github.com/aim-uofa/Matcher.\n","authors":["Yang Liu","Muzhi Zhu","Hengtao Li","Hao Chen","Xinlong Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2305.13310v2.pdf","comment":"Accepted to ICLR2024"},{"id":"http://arxiv.org/abs/2203.09773v2","updated":"2024-01-19T13:01:44Z","published":"2022-03-18T07:35:26Z","title":"Local-Global Context Aware Transformer for Language-Guided Video\n Segmentation","summary":" We explore the task of language-guided video segmentation (LVS). Previous\nalgorithms mostly adopt 3D CNNs to learn video representation, struggling to\ncapture long-term context and easily suffering from visual-linguistic\nmisalignment. In light of this, we present Locater (local-global context aware\nTransformer), which augments the Transformer architecture with a finite memory\nso as to query the entire video with the language expression in an efficient\nmanner. The memory is designed to involve two components -- one for\npersistently preserving global video content, and one for dynamically gathering\nlocal temporal context and segmentation history. Based on the memorized\nlocal-global context and the particular content of each frame, Locater\nholistically and flexibly comprehends the expression as an adaptive query\nvector for each frame. 
The vector is used to query the corresponding frame for\nmask generation. The memory also allows Locater to process videos with linear\ntime complexity and constant size memory, while Transformer-style\nself-attention computation scales quadratically with sequence length. To\nthoroughly examine the visual grounding capability of LVS models, we contribute\na new LVS dataset, A2D-S+, which is built upon A2D-S dataset but poses\nincreased challenges in disambiguating among similar objects. Experiments on\nthree LVS datasets and our A2D-S+ show that Locater outperforms previous\nstate-of-the-arts. Further, we won the 1st place in the Referring Video Object\nSegmentation Track of the 3rd Large-scale Video Object Segmentation Challenge,\nwhere Locater served as the foundation for the winning solution. Our code and\ndataset are available at: https://github.com/leonnnop/Locater\n","authors":["Chen Liang","Wenguan Wang","Tianfei Zhou","Jiaxu Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2203.09773v2.pdf","comment":"Accepted by TPAMI. Code, data: https://github.com/leonnnop/Locater"},{"id":"http://arxiv.org/abs/2401.10666v1","updated":"2024-01-19T12:40:54Z","published":"2024-01-19T12:40:54Z","title":"MixNet: Towards Effective and Efficient UHD Low-Light Image Enhancement","summary":" With the continuous advancement of imaging devices, the prevalence of\nUltra-High-Definition (UHD) images is rising. Although many image restoration\nmethods have achieved promising results, they are not directly applicable to\nUHD images on devices with limited computational resources due to the\ninherently high computational complexity of UHD images. In this paper, we focus\non the task of low-light image enhancement (LLIE) and propose a novel LLIE\nmethod called MixNet, which is designed explicitly for UHD images. To capture\nthe long-range dependency of features without introducing excessive\ncomputational complexity, we present the Global Feature Modulation Layer\n(GFML). GFML associates features from different views by permuting the feature\nmaps, enabling efficient modeling of long-range dependency. In addition, we\nalso design the Local Feature Modulation Layer (LFML) and Feed-forward Layer\n(FFL) to capture local features and transform features into a compact\nrepresentation. This way, our MixNet achieves effective LLIE with few model\nparameters and low computational complexity. We conducted extensive experiments\non both synthetic and real-world datasets, and the comprehensive results\ndemonstrate that our proposed method surpasses the performance of current\nstate-of-the-art methods. The code will be available at\n\\url{https://github.com/zzr-idam/MixNet}.\n","authors":["Chen Wu","Zhuoran Zheng","Xiuyi Jia","Wenqi Ren"],"pdf_url":"https://arxiv.org/pdf/2401.10666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15420v2","updated":"2024-01-19T12:34:42Z","published":"2023-11-26T21:04:28Z","title":"Data-Driven Modelling for Harmonic Current Emission in Low-Voltage Grid\n Using MCReSANet with Interpretability Analysis","summary":" Even though the use of power electronics PE loads offers enhanced electrical\nenergy conversion efficiency and control, they remain the primary sources of\nharmonics in grids. When diverse loads are connected in the distribution\nsystem, their interactions complicate establishing analytical models for the\nrelationship between harmonic voltages and currents. 
To solve this, our paper\npresents a data-driven model using MCReSANet to construct the highly nonlinear\nmapping between harmonic voltage and current. Two datasets from PCCs in Finland and\nGermany are utilized, which demonstrates that MCReSANet is capable of\nestablishing accurate nonlinear mappings, even in the presence of various\nnetwork characteristics for selected Finland and Germany datasets. The model\nbuilt by MCReSANet can improve the MAE by 10% and 14% compared to the CNN, and\nby 8% and 17% compared to the MLP for both Finnish and German datasets, also\nshowing much lower model uncertainty than others. This is a crucial\nprerequisite for more precise SHAP value-based feature importance analysis,\nwhich is a method for the model interpretability analysis in this paper. The\nresults by feature importance analysis show the detailed relationships between\neach order of harmonic voltage and current in the distribution system. There is\nan interactive impact on each order of harmonic current, but some orders of\nharmonic voltages have a dominant influence on harmonic current emissions:\npositive sequence and zero sequence harmonics have the dominant importance in\nthe Finnish and German networks, respectively, which conforms to the pattern of\nconnected load types in two selected Finnish and German datasets. This paper\nenhances the potential for understanding and predicting harmonic current\nemissions by diverse PE loads in distribution systems, which is beneficial to\nmore effective management for optimizing power quality in diverse grid\nenvironments.\n","authors":["Jieyu Yao","Hao Yu","Paul Judge","Jiabin Jia","Sasa Djokic","Verner Püvi","Matti Lehtonen","Jan Meyer"],"pdf_url":"https://arxiv.org/pdf/2311.15420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16516v2","updated":"2024-01-19T12:29:47Z","published":"2023-12-27T10:49:19Z","title":"ConstScene: Dataset and Model for Advancing Robust Semantic Segmentation\n in Construction Environments","summary":" The increasing demand for autonomous machines in construction environments\nnecessitates the development of robust object detection algorithms that can\nperform effectively across various weather and environmental conditions. This\npaper introduces a new semantic segmentation dataset specifically tailored for\nconstruction sites, taking into account the diverse challenges posed by adverse\nweather and environmental conditions. The dataset is designed to enhance the\ntraining and evaluation of object detection models, fostering their\nadaptability and reliability in real-world construction applications. Our\ndataset comprises annotated images captured under a wide range of different\nweather conditions, including but not limited to sunny days, rainy periods,\nfoggy atmospheres, and low-light situations. Additionally, environmental\nfactors such as the existence of dirt/mud on the camera lens are integrated\ninto the dataset through actual captures and synthetic generation to simulate\nthe complex conditions prevalent in construction sites. We also generate\nsynthetic images of the annotations including precise semantic segmentation\nmasks for various objects commonly found in construction environments, such as\nwheel loader machines, personnel, cars, and structural elements. To demonstrate\nthe dataset's utility, we evaluate state-of-the-art object detection algorithms\non our proposed benchmark. 
The results highlight the dataset's success in\nadversarial training models across diverse conditions, showcasing its efficacy\ncompared to existing datasets that lack such environmental variability.\n","authors":["Maghsood Salimi","Mohammad Loni","Sara Afshar","Antonio Cicchetti","Marjan Sirjani"],"pdf_url":"https://arxiv.org/pdf/2312.16516v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2401.10659v1","updated":"2024-01-19T12:26:51Z","published":"2024-01-19T12:26:51Z","title":"BadODD: Bangladeshi Autonomous Driving Object Detection Dataset","summary":" We propose a comprehensive dataset for object detection in diverse driving\nenvironments across 9 districts in Bangladesh. The dataset, collected\nexclusively from smartphone cameras, provided a realistic representation of\nreal-world scenarios, including day and night conditions. Most existing\ndatasets lack suitable classes for autonomous navigation on Bangladeshi roads,\nmaking it challenging for researchers to develop models that can handle the\nintricacies of road scenarios. To address this issue, the authors proposed a\nnew set of classes based on characteristics rather than local vehicle names.\nThe dataset aims to encourage the development of models that can handle the\nunique challenges of Bangladeshi road scenarios for the effective deployment of\nautonomous vehicles. The dataset did not consist of any online images to\nsimulate real-world conditions faced by autonomous vehicles. The classification\nof vehicles is challenging because of the diverse range of vehicles on\nBangladeshi roads, including those not found elsewhere in the world. The\nproposed classification system is scalable and can accommodate future vehicles,\nmaking it a valuable resource for researchers in the autonomous vehicle sector.\n","authors":["Mirza Nihal Baig","Rony Hajong","Mahdi Murshed Patwary","Mohammad Shahidur Rahman","Husne Ara Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2401.10659v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2312.08010v2","updated":"2024-01-19T12:19:48Z","published":"2023-12-13T09:33:08Z","title":"EZ-CLIP: Efficient Zeroshot Video Action Recognition","summary":" Recent advancements in large-scale pre-training of visual-language models on\npaired image-text data have demonstrated impressive generalization capabilities\nfor zero-shot tasks. Building on this success, efforts have been made to adapt\nthese image-based visual-language models, such as CLIP, for videos extending\ntheir zero-shot capabilities to the video domain. While these adaptations have\nshown promising results, they come at a significant computational cost and\nstruggle with effectively modeling the crucial temporal aspects inherent to the\nvideo domain. In this study, we present EZ-CLIP, a simple and efficient\nadaptation of CLIP that addresses these challenges. EZ-CLIP leverages temporal\nvisual prompting for seamless temporal adaptation, requiring no fundamental\nalterations to the core CLIP architecture while preserving its remarkable\ngeneralization abilities. Moreover, we introduce a novel learning objective\nthat guides the temporal visual prompts to focus on capturing motion, thereby\nenhancing its learning capabilities from video data. 
We conducted extensive\nexperiments on five different benchmark datasets, thoroughly evaluating EZ-CLIP\nfor zero-shot learning and base-to-novel video action recognition, and also\ndemonstrating its potential for few-shot generalization.Impressively, with a\nmere 5.2 million learnable parameters (as opposed to the 71.1 million in the\nprior best model), EZ-CLIP can be efficiently trained on a single GPU,\noutperforming existing approaches in several evaluations.\n","authors":["Shahzad Ahmad","Sukalpa Chanda","Yogesh S Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07823v4","updated":"2024-01-19T12:18:28Z","published":"2023-12-13T01:16:50Z","title":"Semantic Lens: Instance-Centric Semantic Alignment for Video\n Super-Resolution","summary":" As a critical clue of video super-resolution (VSR), inter-frame alignment\nsignificantly impacts overall performance. However, accurate pixel-level\nalignment is a challenging task due to the intricate motion interweaving in the\nvideo. In response to this issue, we introduce a novel paradigm for VSR named\nSemantic Lens, predicated on semantic priors drawn from degraded videos.\nSpecifically, video is modeled as instances, events, and scenes via a Semantic\nExtractor. Those semantics assist the Pixel Enhancer in understanding the\nrecovered contents and generating more realistic visual results. The distilled\nglobal semantics embody the scene information of each frame, while the\ninstance-specific semantics assemble the spatial-temporal contexts related to\neach instance. Furthermore, we devise a Semantics-Powered Attention\nCross-Embedding (SPACE) block to bridge the pixel-level features with semantic\nknowledge, composed of a Global Perspective Shifter (GPS) and an\nInstance-Specific Semantic Embedding Encoder (ISEE). Concretely, the GPS module\ngenerates pairs of affine transformation parameters for pixel-level feature\nmodulation conditioned on global semantics. After that, the ISEE module\nharnesses the attention mechanism to align the adjacent frames in the\ninstance-centric semantic space. In addition, we incorporate a simple yet\neffective pre-alignment module to alleviate the difficulty of model training.\nExtensive experiments demonstrate the superiority of our model over existing\nstate-of-the-art VSR methods.\n","authors":["Qi Tang","Yao Zhao","Meiqin Liu","Jian Jin","Chao Yao"],"pdf_url":"https://arxiv.org/pdf/2312.07823v4.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.10643v1","updated":"2024-01-19T11:45:10Z","published":"2024-01-19T11:45:10Z","title":"A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification:\n Models, Data Sets and Challenges","summary":" Vehicle re-identification (ReID) endeavors to associate vehicle images\ncollected from a distributed network of cameras spanning diverse traffic\nenvironments. This task assumes paramount importance within the spectrum of\nvehicle-centric technologies, playing a pivotal role in deploying Intelligent\nTransportation Systems (ITS) and advancing smart city initiatives. Rapid\nadvancements in deep learning have significantly propelled the evolution of\nvehicle ReID technologies in recent years. Consequently, undertaking a\ncomprehensive survey of methodologies centered on deep learning for vehicle\nre-identification has become imperative and inescapable. This paper extensively\nexplores deep learning techniques applied to vehicle ReID. 
It outlines the\ncategorization of these methods, encompassing supervised and unsupervised\napproaches, delves into existing research within these categories, introduces\ndatasets and evaluation criteria, and delineates forthcoming challenges and\npotential research directions. This comprehensive assessment examines the\nlandscape of deep learning in vehicle ReID and establishes a foundation and\nstarting point for future work. It aims to serve as a complete reference by\nhighlighting challenges and emerging trends, fostering advancements and\napplications in vehicle ReID utilizing deep learning models.\n","authors":["Ali Amiri","Aydin Kaya","Ali Seydi Keceli"],"pdf_url":"https://arxiv.org/pdf/2401.10643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10640v1","updated":"2024-01-19T11:35:52Z","published":"2024-01-19T11:35:52Z","title":"A comprehensive study on fidelity metrics for XAI","summary":" The use of eXplainable Artificial Intelligence (XAI) systems has introduced a\nset of challenges that need resolution. Herein, we focus on how to correctly\nselect an XAI method, an open question within the field. The inherent\ndifficulty of this task is due to the lack of a ground truth. Several authors\nhave proposed metrics to approximate the fidelity of different XAI methods.\nThese metrics lack verification and have concerning disagreements. In this\nstudy, we proposed a novel methodology to verify fidelity metrics, using a\nwell-known transparent model, namely a decision tree. This model allowed us to\nobtain explanations with perfect fidelity. Our proposal constitutes the first\nobjective benchmark for these metrics, facilitating a comparison of existing\nproposals, and surpassing existing methods. We applied our benchmark to assess\nthe existing fidelity metrics in two different experiments, each using public\ndatasets comprising 52,000 images. The images from these datasets had a size of\n128 by 128 pixels and were synthetic data that simplified the training process.\nAll metric values indicated a lack of fidelity, with the best one showing a 30\n\\% deviation from the expected values for a perfect explanation. Our\nexperimentation led us to conclude that the current fidelity metrics are not\nreliable enough to be used in real scenarios. From this finding, we deemed it\nnecessary to develop new metrics to avoid the detected problems, and we\nrecommend the use of our proposal as a benchmark within the scientific\ncommunity to address these limitations.\n","authors":["Miquel Miró-Nicolau","Antoni Jaume-i-Capó","Gabriel Moyà-Alcover"],"pdf_url":"https://arxiv.org/pdf/2401.10640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10637v1","updated":"2024-01-19T11:35:07Z","published":"2024-01-19T11:35:07Z","title":"Towards Universal Unsupervised Anomaly Detection in Medical Imaging","summary":" The increasing complexity of medical imaging data underscores the need for\nadvanced anomaly detection methods to automatically identify diverse\npathologies. Current methods face challenges in capturing the broad spectrum of\nanomalies, often limiting their use to specific lesion types in brain scans. To\naddress this challenge, we introduce a novel unsupervised approach, termed\n\\textit{Reversed Auto-Encoders (RA)}, designed to create realistic\npseudo-healthy reconstructions that enable the detection of a wider range of\npathologies. 
We evaluate the proposed method across various imaging modalities,\nincluding magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray,\nand chest X-ray, and demonstrate superior performance in detecting anomalies\ncompared to existing state-of-the-art methods. Our unsupervised anomaly\ndetection approach may enhance diagnostic accuracy in medical imaging by\nidentifying a broader range of unknown pathologies. Our code is publicly\navailable at: \\url{https://github.com/ci-ber/RA}.\n","authors":["Cosmin I. Bercea","Benedikt Wiestler","Daniel Rueckert","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2401.10637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10620v1","updated":"2024-01-19T10:52:57Z","published":"2024-01-19T10:52:57Z","title":"Polytopic Autoencoders with Smooth Clustering for Reduced-order\n Modelling of Flows","summary":" With the advancement of neural networks, there has been a notable increase,\nboth in terms of quantity and variety, in research publications concerning the\napplication of autoencoders to reduced-order models. We propose a polytopic\nautoencoder architecture that includes a lightweight nonlinear encoder, a\nconvex combination decoder, and a smooth clustering network. Supported by\nseveral proofs, the model architecture ensures that all reconstructed states\nlie within a polytope, accompanied by a metric indicating the quality of the\nconstructed polytopes, referred to as polytope error. Additionally, it offers a\nminimal number of convex coordinates for polytopic linear-parameter varying\nsystems while achieving acceptable reconstruction errors compared to proper\northogonal decomposition (POD). To validate our proposed model, we conduct\nsimulations involving two flow scenarios with the incompressible Navier-Stokes\nequation. Numerical results demonstrate the guaranteed properties of the model,\nlow reconstruction errors compared to POD, and the improvement in error using a\nclustering network.\n","authors":["Jan Heiland","Yongho Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10620v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2401.10608v1","updated":"2024-01-19T10:37:27Z","published":"2024-01-19T10:37:27Z","title":"M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics\n Prediction from Histopathology Images","summary":" The advancement of Spatial Transcriptomics (ST) has facilitated the\nspatially-aware profiling of gene expressions based on histopathology images.\nAlthough ST data offers valuable insights into the micro-environment of tumors,\nits acquisition cost remains expensive. Therefore, directly predicting the ST\nexpressions from digital pathology images is desired. Current methods usually\nadopt existing regression backbones for this task, which ignore the inherent\nmulti-scale hierarchical data structure of digital pathology images. To address\nthis limit, we propose M2ORT, a many-to-one regression Transformer that can\naccommodate the hierarchical structure of the pathology images through a\ndecoupled multi-scale feature extractor. Different from traditional models that\nare trained with one-to-one image-label pairs, M2ORT accepts multiple pathology\nimages of different magnifications at a time to jointly predict the gene\nexpressions at their corresponding common ST spot, aiming at learning a\nmany-to-one relationship through training. 
We have tested M2ORT on three public\nST datasets and the experimental results show that M2ORT can achieve\nstate-of-the-art performance with fewer parameters and floating-point\noperations (FLOPs). The code is available at:\nhttps://github.com/Dootmaan/M2ORT/.\n","authors":["Hongyi Wang","Xiuju Du","Jing Liu","Shuyi Ouyang","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10191v2","updated":"2024-01-19T10:01:36Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v2.pdf","comment":"Accepted for ICLR 2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2401.10588v1","updated":"2024-01-19T09:58:06Z","published":"2024-01-19T09:58:06Z","title":"DGL: Dynamic Global-Local Prompt Tuning for Text-Video Retrieval","summary":" Text-video retrieval is a critical multi-modal task to find the most relevant\nvideo for a text query. Although pretrained models like CLIP have demonstrated\nimpressive potential in this area, the rising cost of fully finetuning these\nmodels due to increasing model size continues to pose a problem. To address\nthis challenge, prompt tuning has emerged as an alternative. However, existing\nworks still face two problems when adapting pretrained image-text models to\ndownstream video-text tasks: (1) The visual encoder could only encode\nframe-level features and failed to extract global-level general video\ninformation. (2) Equipping the visual and text encoder with separated prompts\nfailed to mitigate the visual-text modality gap. To this end, we propose DGL, a\ncross-modal Dynamic prompt tuning method with Global-Local video attention. In\ncontrast to previous prompt tuning methods, we employ the shared latent space\nto generate local-level text and frame prompts that encourage inter-modal\ninteraction. Furthermore, we propose modeling video in a global-local attention\nmechanism to capture global video information from the perspective of prompt\ntuning. 
Extensive experiments reveal that when only 0.67% parameters are tuned,\nour cross-modal prompt tuning strategy DGL outperforms or is comparable to\nfully finetuning methods on MSR-VTT, VATEX, LSMDC, and ActivityNet datasets.\nCode will be available at https://github.com/knightyxp/DGL\n","authors":["Xiangpeng Yang","Linchao Zhu","Xiaohan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10588v1.pdf","comment":"AAAI2024, Code will be available at https://github.com/knightyxp/DGL"},{"id":"http://arxiv.org/abs/2303.06088v6","updated":"2024-01-19T09:45:02Z","published":"2023-03-10T17:09:04Z","title":"Towards domain-invariant Self-Supervised Learning with Batch Styles\n Standardization","summary":" In Self-Supervised Learning (SSL), models are typically pretrained,\nfine-tuned, and evaluated on the same domains. However, they tend to perform\npoorly when evaluated on unseen domains, a challenge that Unsupervised Domain\nGeneralization (UDG) seeks to address. Current UDG methods rely on domain\nlabels, which are often challenging to collect, and domain-specific\narchitectures that lack scalability when confronted with numerous domains,\nmaking the current methodology impractical and rigid. Inspired by\ncontrastive-based UDG methods that mitigate spurious correlations by\nrestricting comparisons to examples from the same domain, we hypothesize that\neliminating style variability within a batch could provide a more convenient\nand flexible way to reduce spurious correlations without requiring domain\nlabels. To verify this hypothesis, we introduce Batch Styles Standardization\n(BSS), a relatively simple yet powerful Fourier-based method to standardize the\nstyle of images in a batch specifically designed for integration with SSL\nmethods to tackle UDG. Combining BSS with existing SSL methods offers serious\nadvantages over prior UDG methods: (1) It eliminates the need for domain labels\nor domain-specific network components to enhance domain-invariance in SSL\nrepresentations, and (2) offers flexibility as BSS can be seamlessly integrated\nwith diverse contrastive-based but also non-contrastive-based SSL methods.\nExperiments on several UDG datasets demonstrate that it significantly improves\ndownstream task performances on unseen domains, often outperforming or rivaling\nwith UDG methods. Finally, this work clarifies the underlying mechanisms\ncontributing to BSS's effectiveness in improving domain-invariance in SSL\nrepresentations and performances on unseen domain.\n","authors":["Marin Scalbert","Maria Vakalopoulou","Florent Couzinié-Devy"],"pdf_url":"https://arxiv.org/pdf/2303.06088v6.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10578v1","updated":"2024-01-19T09:41:09Z","published":"2024-01-19T09:41:09Z","title":"3D Shape Completion on Unseen Categories:A Weakly-supervised Approach","summary":" 3D shapes captured by scanning devices are often incomplete due to occlusion.\n3D shape completion methods have been explored to tackle this limitation.\nHowever, most of these methods are only trained and tested on a subset of\ncategories, resulting in poor generalization to unseen categories. In this\npaper, we introduce a novel weakly-supervised framework to reconstruct the\ncomplete shapes from unseen categories. We first propose an end-to-end\nprior-assisted shape learning network that leverages data from the seen\ncategories to infer a coarse shape. Specifically, we construct a prior bank\nconsisting of representative shapes from the seen categories. 
Then, we design a\nmulti-scale pattern correlation module for learning the complete shape of the\ninput by analyzing the correlation between local patterns within the input and\nthe priors at various scales. In addition, we propose a self-supervised shape\nrefinement model to further refine the coarse shape. Considering the shape\nvariability of 3D objects across categories, we construct a category-specific\nprior bank to facilitate shape refinement. Then, we devise a voxel-based\npartial matching loss and leverage the partial scans to drive the refinement\nprocess. Extensive experimental results show that our approach is superior to\nstate-of-the-art methods by a large margin.\n","authors":["Lintai Wu","Junhui Hou","Linqi Song","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2401.10578v1.pdf","comment":"13 pages,8 figures"},{"id":"http://arxiv.org/abs/2401.10564v1","updated":"2024-01-19T09:01:20Z","published":"2024-01-19T09:01:20Z","title":"Dream360: Diverse and Immersive Outdoor Virtual Scene Creation via\n Transformer-Based 360 Image Outpainting","summary":" 360 images, with a field-of-view (FoV) of 180x360, provide immersive and\nrealistic environments for emerging virtual reality (VR) applications, such as\nvirtual tourism, where users desire to create diverse panoramic scenes from a\nnarrow FoV photo they take from a viewpoint via portable devices. It thus\nbrings us to a technical challenge: `How to allow the users to freely create\ndiverse and immersive virtual scenes from a narrow FoV image with a specified\nviewport?' To this end, we propose a transformer-based 360 image outpainting\nframework called Dream360, which can generate diverse, high-fidelity, and\nhigh-resolution panoramas from user-selected viewports, considering the\nspherical properties of 360 images. Compared with existing methods, e.g., [3],\nwhich primarily focus on inputs with rectangular masks and central locations\nwhile overlooking the spherical property of 360 images, our Dream360 offers\nhigher outpainting flexibility and fidelity based on the spherical\nrepresentation. Dream360 comprises two key learning stages: (I) codebook-based\npanorama outpainting via Spherical-VQGAN (S-VQGAN), and (II) frequency-aware\nrefinement with a novel frequency-aware consistency loss. Specifically, S-VQGAN\nlearns a sphere-specific codebook from spherical harmonic (SH) values,\nproviding a better representation of spherical data distribution for scene\nmodeling. The frequency-aware refinement matches the resolution and further\nimproves the semantic consistency and visual fidelity of the generated results.\nOur Dream360 achieves significantly lower Frechet Inception Distance (FID)\nscores and better visual fidelity than existing methods. 
We also conducted a\nuser study involving 15 participants to interactively evaluate the quality of\nthe generated results in VR, demonstrating the flexibility and superiority of\nour Dream360 framework.\n","authors":["Hao Ai","Zidong Cao","Haonan Lu","Chen Chen","Jian Ma","Pengyuan Zhou","Tae-Kyun Kim","Pan Hui","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10564v1.pdf","comment":"11 pages, accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.10561v1","updated":"2024-01-19T08:54:54Z","published":"2024-01-19T08:54:54Z","title":"MAEDiff: Masked Autoencoder-enhanced Diffusion Models for Unsupervised\n Anomaly Detection in Brain Images","summary":" Unsupervised anomaly detection has gained significant attention in the field\nof medical imaging due to its capability of relieving the costly pixel-level\nannotation. To achieve this, modern approaches usually utilize generative\nmodels to produce healthy references of the diseased images and then identify\nthe abnormalities by comparing the healthy references and the original diseased\nimages. Recently, diffusion models have exhibited promising potential for\nunsupervised anomaly detection in medical images for their good mode coverage\nand high sample quality. However, the intrinsic characteristics of the medical\nimages, e.g. the low contrast, and the intricate anatomical structure of the\nhuman body make the reconstruction challenging. Besides, the global information\nof medical images often remain underutilized. To address these two issues, we\npropose a novel Masked Autoencoder-enhanced Diffusion Model (MAEDiff) for\nunsupervised anomaly detection in brain images. The MAEDiff involves a\nhierarchical patch partition. It generates healthy images by overlapping\nupper-level patches and implements a mechanism based on the masked autoencoders\noperating on the sub-level patches to enhance the condition on the unnoised\nregions. Extensive experiments on data of tumors and multiple sclerosis lesions\ndemonstrate the effectiveness of our method.\n","authors":["Rui Xu","Yunke Wang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2401.10561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10560v1","updated":"2024-01-19T08:52:24Z","published":"2024-01-19T08:52:24Z","title":"360ORB-SLAM: A Visual SLAM System for Panoramic Images with Depth\n Completion Network","summary":" To enhance the performance and effect of AR/VR applications and visual\nassistance and inspection systems, visual simultaneous localization and mapping\n(vSLAM) is a fundamental task in computer vision and robotics. However,\ntraditional vSLAM systems are limited by the camera's narrow field-of-view,\nresulting in challenges such as sparse feature distribution and lack of dense\ndepth information. To overcome these limitations, this paper proposes a\n360ORB-SLAM system for panoramic images that combines with a depth completion\nnetwork. The system extracts feature points from the panoramic image, utilizes\na panoramic triangulation module to generate sparse depth information, and\nemploys a depth completion network to obtain a dense panoramic depth map.\nExperimental results on our novel panoramic dataset constructed based on Carla\ndemonstrate that the proposed method achieves superior scale accuracy compared\nto existing monocular SLAM methods and effectively addresses the challenges of\nfeature association and scale ambiguity. 
The integration of the depth\ncompletion network enhances system stability and mitigates the impact of\ndynamic elements on SLAM performance.\n","authors":["Yichen Chen","Yiqi Pan","Ruyu Liu","Haoyu Zhang","Guodao Zhang","Bo Sun","Jianhua Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10560v1.pdf","comment":"6 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02119v3","updated":"2024-01-19T08:50:28Z","published":"2023-09-05T10:52:21Z","title":"Hierarchical Masked 3D Diffusion Model for Video Outpainting","summary":" Video outpainting aims to adequately complete missing areas at the edges of\nvideo frames. Compared to image outpainting, it presents an additional\nchallenge as the model should maintain the temporal consistency of the filled\narea. In this paper, we introduce a masked 3D diffusion model for video\noutpainting. We use the technique of mask modeling to train the 3D diffusion\nmodel. This allows us to use multiple guide frames to connect the results of\nmultiple video clip inferences, thus ensuring temporal consistency and reducing\njitter between adjacent frames. Meanwhile, we extract the global frames of the\nvideo as prompts and guide the model to obtain information other than the\ncurrent video clip using cross-attention. We also introduce a hybrid\ncoarse-to-fine inference pipeline to alleviate the artifact accumulation\nproblem. The existing coarse-to-fine pipeline only uses the infilling strategy,\nwhich brings degradation because the time interval of the sparse frames is too\nlarge. Our pipeline benefits from bidirectional learning of the mask modeling\nand thus can employ a hybrid strategy of infilling and interpolation when\ngenerating sparse frames. Experiments show that our method achieves\nstate-of-the-art results in video outpainting tasks. More results and codes are\nprovided at our https://fanfanda.github.io/M3DDM/.\n","authors":["Fanda Fan","Chaoxu Guo","Litong Gong","Biao Wang","Tiezheng Ge","Yuning Jiang","Chunjie Luo","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.02119v3.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2401.10556v1","updated":"2024-01-19T08:44:52Z","published":"2024-01-19T08:44:52Z","title":"Symbol as Points: Panoptic Symbol Spotting via Point-based\n Representation","summary":" This work studies the problem of panoptic symbol spotting, which is to spot\nand parse both countable object instances (windows, doors, tables, etc.) and\nuncountable stuff (wall, railing, etc.) from computer-aided design (CAD)\ndrawings. Existing methods typically involve either rasterizing the vector\ngraphics into images and using image-based methods for symbol spotting, or\ndirectly building graphs and using graph neural networks for symbol\nrecognition. In this paper, we take a different approach, which treats graphic\nprimitives as a set of 2D points that are locally connected and use point cloud\nsegmentation methods to tackle it. Specifically, we utilize a point transformer\nto extract the primitive features and append a mask2former-like spotting head\nto predict the final output. To better use the local connection information of\nprimitives and enhance their discriminability, we further propose the attention\nwith connection module (ACM) and contrastive connection learning scheme (CCL).\nFinally, we propose a KNN interpolation mechanism for the mask attention module\nof the spotting head to better handle primitive mask downsampling, which is\nprimitive-level in contrast to pixel-level for the image. 
Our approach, named\nSymPoint, is simple yet effective, outperforming recent state-of-the-art method\nGAT-CADNet by an absolute increase of 9.6% PQ and 10.4% RQ on the FloorPlanCAD\ndataset. The source code and models will be available at\nhttps://github.com/nicehuster/SymPoint.\n","authors":["Wenlong Liu","Tianyu Yang","Yuhan Wang","Qizhi Yu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10556v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2309.02773v2","updated":"2024-01-19T08:01:15Z","published":"2023-09-06T06:31:08Z","title":"Diffusion Model is Secretly a Training-free Open Vocabulary Semantic\n Segmenter","summary":" The pre-trained text-image discriminative models, such as CLIP, has been\nexplored for open-vocabulary semantic segmentation with unsatisfactory results\ndue to the loss of crucial localization information and awareness of object\nshapes. Recently, there has been a growing interest in expanding the\napplication of generative models from generation tasks to semantic\nsegmentation. These approaches utilize generative models either for generating\nannotated data or extracting features to facilitate semantic segmentation. This\ntypically involves generating a considerable amount of synthetic data or\nrequiring additional mask annotations. To this end, we uncover the potential of\ngenerative text-to-image diffusion models (e.g., Stable Diffusion) as highly\nefficient open-vocabulary semantic segmenters, and introduce a novel\ntraining-free approach named DiffSegmenter. The insight is that to generate\nrealistic objects that are semantically faithful to the input text, both the\ncomplete object shapes and the corresponding semantics are implicitly learned\nby diffusion models. We discover that the object shapes are characterized by\nthe self-attention maps while the semantics are indicated through the\ncross-attention maps produced by the denoising U-Net, forming the basis of our\nsegmentation results.Additionally, we carefully design effective textual\nprompts and a category filtering mechanism to further enhance the segmentation\nresults. Extensive experiments on three benchmark datasets show that the\nproposed DiffSegmenter achieves impressive results for open-vocabulary semantic\nsegmentation.\n","authors":["Jinglong Wang","Xiawei Li","Jing Zhang","Qingyuan Xu","Qin Zhou","Qian Yu","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2309.02773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10541v1","updated":"2024-01-19T07:44:32Z","published":"2024-01-19T07:44:32Z","title":"I-SplitEE: Image classification in Split Computing DNNs with Early Exits","summary":" The recent advances in Deep Neural Networks (DNNs) stem from their\nexceptional performance across various domains. However, their inherent large\nsize hinders deploying these networks on resource-constrained devices like\nedge, mobile, and IoT platforms. Strategies have emerged, from partial cloud\ncomputation offloading (split computing) to integrating early exits within DNN\nlayers. Our work presents an innovative unified approach merging early exits\nand split computing. We determine the 'splitting layer', the optimal depth in\nthe DNN for edge device computations, and whether to infer on edge device or be\noffloaded to the cloud for inference considering accuracy, computational\nefficiency, and communication costs. Also, Image classification faces diverse\nenvironmental distortions, influenced by factors like time of day, lighting,\nand weather. 
To adapt to these distortions, we introduce I-SplitEE, an online\nunsupervised algorithm ideal for scenarios lacking ground truths and with\nsequential data. Experimental validation using Caltech-256 and Cifar-10\ndatasets subjected to varied distortions showcases I-SplitEE's ability to\nreduce costs by a minimum of 55% with marginal performance degradation of at\nmost 5%.\n","authors":["Divya Jyoti Bajpai","Aastha Jaiswal","Manjesh Kumar Hanawal"],"pdf_url":"https://arxiv.org/pdf/2401.10541v1.pdf","comment":"To appear in proceedings of IEEE International Conference on\n Communications 2024"},{"id":"http://arxiv.org/abs/2401.10537v1","updated":"2024-01-19T07:31:44Z","published":"2024-01-19T07:31:44Z","title":"Learning Position-Aware Implicit Neural Network for Real-World Face\n Inpainting","summary":" Face inpainting requires the model to have a precise global understanding of\nthe facial position structure. Benefiting from the powerful capabilities of\ndeep learning backbones, recent works in face inpainting have achieved decent\nperformance in ideal setting (square shape with $512px$). However, existing\nmethods often produce a visually unpleasant result, especially in the\nposition-sensitive details (e.g., eyes and nose), when directly applied to\narbitrary-shaped images in real-world scenarios. The visually unpleasant\nposition-sensitive details indicate the shortcomings of existing methods in\nterms of position information processing capability. In this paper, we propose\nan \\textbf{I}mplicit \\textbf{N}eural \\textbf{I}npainting \\textbf{N}etwork\n(IN$^2$) to handle arbitrary-shape face images in real-world scenarios by\nexplicit modeling for position information. Specifically, a downsample\nprocessing encoder is proposed to reduce information loss while obtaining the\nglobal semantic feature. A neighbor hybrid attention block is proposed with a\nhybrid attention mechanism to improve the facial understanding ability of the\nmodel without restricting the shape of the input. Finally, an implicit neural\npyramid decoder is introduced to explicitly model position information and\nbridge the gap between low-resolution features and high-resolution output.\nExtensive experiments demonstrate the superiority of the proposed method in\nreal-world face inpainting task.\n","authors":["Bo Zhao","Huan Yang","Jianlong Fu"],"pdf_url":"https://arxiv.org/pdf/2401.10537v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.16451v3","updated":"2024-01-19T07:27:18Z","published":"2023-12-27T07:35:17Z","title":"Domain Generalization with Vital Phase Augmentation","summary":" Deep neural networks have shown remarkable performance in image\nclassification. However, their performance significantly deteriorates with\ncorrupted input data. Domain generalization methods have been proposed to train\nrobust models against out-of-distribution data. Data augmentation in the\nfrequency domain is one of such approaches that enable a model to learn phase\nfeatures to establish domain-invariant representations. This approach changes\nthe amplitudes of the input data while preserving the phases. However, using\nfixed phases leads to susceptibility to phase fluctuations because amplitudes\nand phase fluctuations commonly occur in out-of-distribution. In this study, to\naddress this problem, we introduce an approach using finite variation of the\nphases of input data rather than maintaining fixed phases. 
Based on the\nassumption that the degree of domain-invariant features varies for each phase,\nwe propose a method to distinguish phases based on this degree. In addition, we\npropose a method called vital phase augmentation (VIPAug) that applies the\nvariation to the phases differently according to the degree of domain-invariant\nfeatures of given phases. The model depends more on the vital phases that\ncontain more domain-invariant features for attaining robustness to amplitude\nand phase fluctuations. We present experimental evaluations of our proposed\napproach, which exhibited improved performance for both clean and corrupted\ndata. VIPAug achieved SOTA performance on the benchmark CIFAR-10 and CIFAR-100\ndatasets, as well as near-SOTA performance on the ImageNet-100 and ImageNet\ndatasets. Our code is available at https://github.com/excitedkid/vipaug.\n","authors":["Ingyun Lee","Wooju Lee","Hyun Myung"],"pdf_url":"https://arxiv.org/pdf/2312.16451v3.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2309.06023v4","updated":"2024-01-19T07:22:30Z","published":"2023-09-12T07:50:54Z","title":"Learning from History: Task-agnostic Model Contrastive Learning for\n Image Restoration","summary":" Contrastive learning has emerged as a prevailing paradigm for high-level\nvision tasks, which, by introducing properly negative samples, has also been\nexploited for low-level vision tasks to achieve a compact optimization space to\naccount for their ill-posed nature. However, existing methods rely on manually\npredefined and task-oriented negatives, which often exhibit pronounced\ntask-specific biases. To address this challenge, our paper introduces an\ninnovative method termed 'learning from history', which dynamically generates\nnegative samples from the target model itself. Our approach, named Model\nContrastive paradigm for Image Restoration (MCIR), rejuvenates latency models\nas negative models, making it compatible with diverse image restoration tasks.\nWe propose the Self-Prior guided Negative loss (SPN) to enable it. This\napproach significantly enhances existing models when retrained with the\nproposed model contrastive paradigm. The results show significant improvements\nin image restoration across various tasks and architectures. For example,\nmodels retrained with SPN outperform the original FFANet and DehazeFormer by\n3.41 dB and 0.57 dB on the RESIDE indoor dataset for image dehazing. Similarly,\nthey achieve notable improvements of 0.47 dB on SPA-Data over IDT for image\nderaining and 0.12 dB on Manga109 for a 4x scale super-resolution over\nlightweight SwinIR, respectively. Code and retrained models are available at\nhttps://github.com/Aitical/MCIR.\n","authors":["Gang Wu","Junjun Jiang","Kui Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06023v4.pdf","comment":"Camera Ready Version. Accepted to The 38th Annual AAAI Conference on\n Artificial Intelligence (AAAI 2024)"},{"id":"http://arxiv.org/abs/2401.10530v1","updated":"2024-01-19T07:12:36Z","published":"2024-01-19T07:12:36Z","title":"NWPU-MOC: A Benchmark for Fine-grained Multi-category Object Counting in\n Aerial Images","summary":" Object counting is a hot topic in computer vision, which aims to estimate the\nnumber of objects in a given image. However, most methods only count objects of\na single category for an image, which cannot be applied to scenes that need to\ncount objects with multiple categories simultaneously, especially in aerial\nscenes. 
To this end, this paper introduces a Multi-category Object Counting\n(MOC) task to estimate the numbers of different objects (cars, buildings,\nships, etc.) in an aerial image. Considering the absence of a dataset for this\ntask, a large-scale Dataset (NWPU-MOC) is collected, consisting of 3,416 scenes\nwith a resolution of 1024 $\\times$ 1024 pixels, and well-annotated using 14\nfine-grained object categories. Besides, each scene contains RGB and Near\nInfrared (NIR) images, of which the NIR spectrum can provide richer\ncharacterization information compared with only the RGB spectrum. Based on\nNWPU-MOC, the paper presents a multi-spectrum, multi-category object counting\nframework, which employs a dual-attention module to fuse the features of RGB\nand NIR and subsequently regress multi-channel density maps corresponding to\neach object category. In addition, to modeling the dependency between different\nchannels in the density map with each object category, a spatial contrast loss\nis designed as a penalty for overlapping predictions at the same spatial\nposition. Experimental results demonstrate that the proposed method achieves\nstate-of-the-art performance compared with some mainstream counting algorithms.\nThe dataset, code and models are publicly available at\nhttps://github.com/lyongo/NWPU-MOC.\n","authors":["Junyu Gao","Liangliang Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2401.10530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of cooccurring behaviors, and the\ncompounding impact of behavioral hallucinations. 
Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10526v1","updated":"2024-01-19T07:06:58Z","published":"2024-01-19T07:06:58Z","title":"On mitigating stability-plasticity dilemma in CLIP-guided image morphing\n via geodesic distillation loss","summary":" Large-scale language-vision pre-training models, such as CLIP, have achieved\nremarkable text-guided image morphing results by leveraging several\nunconditional generative models. However, existing CLIP-guided image morphing\nmethods encounter difficulties when morphing photorealistic images.\nSpecifically, existing guidance fails to provide detailed explanations of the\nmorphing regions within the image, leading to misguidance. In this paper, we\nobserved that such misguidance could be effectively mitigated by simply using a\nproper regularization loss. Our approach comprises two key components: 1) a\ngeodesic cosine similarity loss that minimizes inter-modality features (i.e.,\nimage and text) on a projected subspace of CLIP space, and 2) a latent\nregularization loss that minimizes intra-modality features (i.e., image and\nimage) on the image manifold. By replacing the na\\\"ive directional CLIP loss in\na drop-in replacement manner, our method achieves superior morphing results on\nboth images and videos for various benchmarks, including CLIP-inversion.\n","authors":["Yeongtak Oh","Saehyung Lee","Uiwon Hwang","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2401.10526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07567v2","updated":"2024-01-19T07:04:56Z","published":"2024-01-15T09:59:43Z","title":"Bias-Conflict Sample Synthesis and Adversarial Removal Debias Strategy\n for Temporal Sentence Grounding in Video","summary":" Temporal Sentence Grounding in Video (TSGV) is troubled by dataset bias\nissue, which is caused by the uneven temporal distribution of the target\nmoments for samples with similar semantic components in input videos or query\ntexts. Existing methods resort to utilizing prior knowledge about bias to\nartificially break this uneven distribution, which only removes a limited\namount of significant language biases. In this work, we propose the\nbias-conflict sample synthesis and adversarial removal debias strategy\n(BSSARD), which dynamically generates bias-conflict samples by explicitly\nleveraging potentially spurious correlations between single-modality features\nand the temporal position of the target moments. Through adversarial training,\nits bias generators continuously introduce biases and generate bias-conflict\nsamples to deceive its grounding model. Meanwhile, the grounding model\ncontinuously eliminates the introduced biases, which requires it to model\nmulti-modality alignment information. BSSARD will cover most kinds of coupling\nrelationships and disrupt language and visual biases simultaneously. Extensive\nexperiments on Charades-CD and ActivityNet-CD demonstrate the promising\ndebiasing capability of BSSARD. 
Source codes are available at\nhttps://github.com/qzhb/BSSARD.\n","authors":["Zhaobo Qi","Yibo Yuan","Xiaowen Ruan","Shuhui Wang","Weigang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2401.07567v2.pdf","comment":"accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.10525v1","updated":"2024-01-19T07:01:07Z","published":"2024-01-19T07:01:07Z","title":"Focaler-IoU: More Focused Intersection over Union Loss","summary":" Bounding box regression plays a crucial role in the field of object\ndetection, and the positioning accuracy of object detection largely depends on\nthe loss function of bounding box regression. Existing research improves\nregression performance by utilizing the geometric relationship between bounding\nboxes, while ignoring the impact of difficult and easy sample distribution on\nbounding box regression. In this article, we analyzed the impact of difficult\nand easy sample distribution on regression results, and then proposed\nFocaler-IoU, which can improve detector performance in different detection\ntasks by focusing on different regression samples. Finally, comparative\nexperiments were conducted using existing advanced detectors and regression\nmethods for different detection tasks, and the detection performance was\nfurther improved by using the method proposed in this paper. Code is available\nat \\url{https://github.com/malagoutou/Focaler-IoU}.\n","authors":["Hao Zhang","Shuaijie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10525v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.17663"},{"id":"http://arxiv.org/abs/2401.10512v1","updated":"2024-01-19T06:04:48Z","published":"2024-01-19T06:04:48Z","title":"Exploring Color Invariance through Image-Level Ensemble Learning","summary":" In the field of computer vision, the persistent presence of color bias,\nresulting from fluctuations in real-world lighting and camera conditions,\npresents a substantial challenge to the robustness of models. This issue is\nparticularly pronounced in complex wide-area surveillance scenarios, such as\nperson re-identification and industrial dust segmentation, where models often\nexperience a decline in performance due to overfitting on color information\nduring training, given the presence of environmental variations. Consequently,\nthere is a need to effectively adapt models to cope with the complexities of\ncamera conditions. To address this challenge, this study introduces a learning\nstrategy named Random Color Erasing, which draws inspiration from ensemble\nlearning. This strategy selectively erases partial or complete color\ninformation in the training data without disrupting the original image\nstructure, thereby achieving a balanced weighting of color features and other\nfeatures within the neural network. This approach mitigates the risk of\noverfitting and enhances the model's ability to handle color variation, thereby\nimproving its overall robustness. The approach we propose serves as an ensemble\nlearning strategy, characterized by robust interpretability. A comprehensive\nanalysis of this methodology is presented in this paper. Across various tasks\nsuch as person re-identification and semantic segmentation, our approach\nconsistently improves strong baseline methods. Notably, in comparison to\nexisting methods that prioritize color robustness, our strategy significantly\nenhances performance in cross-domain scenarios. 
The code available at\n\\url{https://github.com/layumi/Person\\_reID\\_baseline\\_pytorch/blob/master/random\\_erasing.py}\nor \\url{https://github.com/finger-monkey/Data-Augmentation}.\n","authors":["Yunpeng Gong","Jiaquan Li","Lifei Chen","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.10512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10511v1","updated":"2024-01-19T06:03:01Z","published":"2024-01-19T06:03:01Z","title":"GMC-IQA: Exploiting Global-correlation and Mean-opinion Consistency for\n No-reference Image Quality Assessment","summary":" Due to the subjective nature of image quality assessment (IQA), assessing\nwhich image has better quality among a sequence of images is more reliable than\nassigning an absolute mean opinion score for an image. Thus, IQA models are\nevaluated by global correlation consistency (GCC) metrics like PLCC and SROCC,\nrather than mean opinion consistency (MOC) metrics like MAE and MSE. However,\nmost existing methods adopt MOC metrics to define their loss functions, due to\nthe infeasible computation of GCC metrics during training. In this work, we\nconstruct a novel loss function and network to exploit Global-correlation and\nMean-opinion Consistency, forming a GMC-IQA framework. Specifically, we propose\na novel GCC loss by defining a pairwise preference-based rank estimation to\nsolve the non-differentiable problem of SROCC and introducing a queue mechanism\nto reserve previous data to approximate the global results of the whole data.\nMoreover, we propose a mean-opinion network, which integrates diverse opinion\nfeatures to alleviate the randomness of weight learning and enhance the model\nrobustness. Experiments indicate that our method outperforms SOTA methods on\nmultiple authentic datasets with higher accuracy and generalization. We also\nadapt the proposed loss to various networks, which brings better performance\nand more stable training.\n","authors":["Zewen Chen","Juan Wang","Bing Li","Chunfeng Yuan","Weiming Hu","Junxian Liu","Peng Li","Yan Wang","Youqun Zhang","Congxuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05594v3","updated":"2024-01-19T05:50:58Z","published":"2024-01-10T23:55:16Z","title":"Wasserstein Distance-based Expansion of Low-Density Latent Regions for\n Unknown Class Detection","summary":" This paper addresses the significant challenge in open-set object detection\n(OSOD): the tendency of state-of-the-art detectors to erroneously classify\nunknown objects as known categories with high confidence. We present a novel\napproach that effectively identifies unknown objects by distinguishing between\nhigh and low-density regions in latent space. Our method builds upon the\nOpen-Det (OD) framework, introducing two new elements to the loss function.\nThese elements enhance the known embedding space's clustering and expand the\nunknown space's low-density regions. The first addition is the Class\nWasserstein Anchor (CWA), a new function that refines the classification\nboundaries. The second is a spectral normalisation step, improving the\nrobustness of the model. Together, these augmentations to the existing\nContrastive Feature Learner (CFL) and Unknown Probability Learner (UPL) loss\nfunctions significantly improve OSOD performance. 
Our proposed OpenDet-CWA\n(OD-CWA) method demonstrates: a) a reduction in open-set errors by\napproximately 17%-22%, b) an enhancement in novelty detection capability by\n1.5%-16%, and c) a decrease in the wilderness index by 2%-20% across various\nopen-set scenarios. These results represent a substantial advancement in the\nfield, showcasing the potential of our approach in managing the complexities of\nopen-set object detection.\n","authors":["Prakash Mallick","Feras Dayoub","Jamie Sherrah"],"pdf_url":"https://arxiv.org/pdf/2401.05594v3.pdf","comment":"8 Full length pages, followed by 2 supplementary pages, total of 9\n Figures"},{"id":"http://arxiv.org/abs/2208.09424v3","updated":"2024-01-19T05:32:54Z","published":"2022-08-19T16:16:59Z","title":"Hierarchical Compositional Representations for Few-shot Action\n Recognition","summary":" Recently action recognition has received more and more attention for its\ncomprehensive and practical applications in intelligent surveillance and\nhuman-computer interaction. However, few-shot action recognition has not been\nwell explored and remains challenging because of data scarcity. In this paper,\nwe propose a novel hierarchical compositional representations (HCR) learning\napproach for few-shot action recognition. Specifically, we divide a complicated\naction into several sub-actions by carefully designed hierarchical clustering\nand further decompose the sub-actions into more fine-grained spatially\nattentional sub-actions (SAS-actions). Although there exist large differences\nbetween base classes and novel classes, they can share similar patterns in\nsub-actions or SAS-actions. Furthermore, we adopt the Earth Mover's Distance in\nthe transportation problem to measure the similarity between video samples in\nterms of sub-action representations. It computes the optimal matching flows\nbetween sub-actions as distance metric, which is favorable for comparing\nfine-grained patterns. Extensive experiments show our method achieves the\nstate-of-the-art results on HMDB51, UCF101 and Kinetics datasets.\n","authors":["Changzhen Li","Jie Zhang","Shuzhe Wu","Xin Jin","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2208.09424v3.pdf","comment":"Accepted by Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2401.10501v1","updated":"2024-01-19T05:28:51Z","published":"2024-01-19T05:28:51Z","title":"Enhancing medical vision-language contrastive learning via\n inter-matching relation modelling","summary":" Medical image representations can be learned through medical vision-language\ncontrastive learning (mVLCL) where medical imaging reports are used as weak\nsupervision through image-text alignment. These learned image representations\ncan be transferred to and benefit various downstream medical vision tasks such\nas disease classification and segmentation. Recent mVLCL methods attempt to\nalign image sub-regions and the report keywords as local-matchings. However,\nthese methods aggregate all local-matchings via simple pooling operations while\nignoring the inherent relations between them. These methods therefore fail to\nreason between local-matchings that are semantically related, e.g.,\nlocal-matchings that correspond to the disease word and the location word\n(semantic-relations), and also fail to differentiate such clinically important\nlocal-matchings from others that correspond to less meaningful words, e.g.,\nconjunction words (importance-relations). 
Hence, we propose a mVLCL method that\nmodels the inter-matching relations between local-matchings via a\nrelation-enhanced contrastive learning framework (RECLF). In RECLF, we\nintroduce a semantic-relation reasoning module (SRM) and an importance-relation\nreasoning module (IRM) to enable more fine-grained report supervision for image\nrepresentation learning. We evaluated our method using four public benchmark\ndatasets on four downstream tasks, including segmentation, zero-shot\nclassification, supervised classification, and cross-modal retrieval. Our\nresults demonstrated the superiority of our RECLF over the state-of-the-art\nmVLCL methods with consistent improvements across single-modal and cross-modal\ntasks. These results suggest that our RECLF, by modelling the inter-matching\nrelations, can learn improved medical image representations with better\ngeneralization capabilities.\n","authors":["Mingjian Li","Mingyuan Meng","Michael Fulham","David Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10501v1.pdf","comment":"11 pages, 5 figures. Under review"},{"id":"http://arxiv.org/abs/2401.09895v2","updated":"2024-01-19T05:27:15Z","published":"2024-01-18T11:14:32Z","title":"Skeleton-Guided Instance Separation for Fine-Grained Segmentation in\n Microscopy","summary":" One of the fundamental challenges in microscopy (MS) image analysis is\ninstance segmentation (IS), particularly when segmenting cluster regions where\nmultiple objects of varying sizes and shapes may be connected or even\noverlapped in arbitrary orientations. Existing IS methods usually fail in\nhandling such scenarios, as they rely on coarse instance representations such\nas keypoints and horizontal bounding boxes (h-bboxes). In this paper, we\npropose a novel one-stage framework named A2B-IS to address this challenge and\nenhance the accuracy of IS in MS images. Our approach represents each instance\nwith a pixel-level mask map and a rotated bounding box (r-bbox). Unlike\ntwo-stage methods that use box proposals for segmentations, our method\ndecouples mask and box predictions, enabling simultaneous processing to\nstreamline the model pipeline. Additionally, we introduce a Gaussian skeleton\nmap to aid the IS task in two key ways: (1) It guides anchor placement,\nreducing computational costs while improving the model's capacity to learn\nRoI-aware features by filtering out noise from background regions. (2) It\nensures accurate isolation of densely packed instances by rectifying erroneous\nbox predictions near instance boundaries. To further enhance the performance,\nwe integrate two modules into the framework: (1) An Atrous Attention Block\n(A2B) designed to extract high-resolution feature maps with fine-grained\nmultiscale information, and (2) A Semi-Supervised Learning (SSL) strategy that\nleverages both labeled and unlabeled images for model training. 
Our method has\nbeen thoroughly validated on two large-scale MS datasets, demonstrating its\nsuperiority over most state-of-the-art approaches.\n","authors":["Jun Wang","Chengfeng Zhou","Zhaoyan Ming","Lina Wei","Xudong Jiang","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2401.09895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12653v2","updated":"2024-01-19T04:37:18Z","published":"2023-12-19T22:53:32Z","title":"Diagnosis Of Takotsubo Syndrome By Robust Feature Selection From The\n Complex Latent Space Of DL-based Segmentation Network","summary":" Researchers have shown significant correlations among segmented objects in\nvarious medical imaging modalities and disease-related pathologies. Several\nstudies showed that using hand-crafted features for disease prediction neglects\nthe immense possibility of using latent features from deep learning (DL) models,\nwhich may reduce the overall accuracy of differential diagnosis. However,\ndirectly using classification or segmentation models on medical images to learn latent\nfeatures forgoes robust feature selection and may lead to overfitting. To fill\nthis gap, we propose a novel feature selection technique using the latent space\nof a segmentation model that can aid diagnosis. We evaluated our method in\ndifferentiating a rare cardiac disease, Takotsubo Syndrome (TTS), from ST\nelevation myocardial infarction (STEMI) using echocardiogram videos (echo). TTS\ncan mimic clinical features of STEMI in echo and is extremely hard to distinguish.\nOur approach shows promising results in differential diagnosis of TTS with 82%\ndiagnostic accuracy, beating the previous state-of-the-art (SOTA) approach.\nMoreover, the robust feature selection technique using the LASSO algorithm shows\ngreat potential in reducing redundant features and creates a robust\npipeline for short- and long-term disease prognoses in the downstream analysis.\n","authors":["Fahim Ahmed Zaman","Wahidul Alam","Tarun Kanti Roy","Amanda Chang","Kan Liu","Xiaodong Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12653v2.pdf","comment":"5 pages, 3 figures, conference"},{"id":"http://arxiv.org/abs/2401.10150v2","updated":"2024-01-19T04:27:05Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process. 
Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos.\n","authors":["Changgu Chen","Junwei Shu","Lianggangxu Chen","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.09721v2","updated":"2024-01-19T04:07:33Z","published":"2024-01-18T04:51:41Z","title":"Fast graph-based denoising for point cloud color information","summary":" Point clouds are utilized in various 3D applications such as cross-reality\n(XR) and realistic 3D displays. In some applications, e.g., for live streaming\nusing a 3D point cloud, real-time point cloud denoising methods are required to\nenhance the visual quality. However, conventional high-precision denoising\nmethods cannot be executed in real time for large-scale point clouds owing to\nthe complexity of graph constructions with K nearest neighbors and noise level\nestimation. This paper proposes a fast graph-based denoising (FGBD) for a\nlarge-scale point cloud. First, high-speed graph construction is achieved by\nscanning a point cloud in various directions and searching adjacent\nneighborhoods on the scanning lines. Second, we propose a fast noise level\nestimation method using eigenvalues of the covariance matrix on a graph.\nFinally, we also propose a new low-cost filter selection method to enhance\ndenoising accuracy to compensate for the degradation caused by the acceleration\nalgorithms. In our experiments, we succeeded in reducing the processing time\ndramatically while maintaining accuracy relative to conventional denoising\nmethods. Denoising was performed at 30fps, with frames containing approximately\n1 million points.\n","authors":["Ryosuke Watanabe","Keisuke Nonaka","Eduardo Pavez","Tatsuya Kobayashi","Antonio Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.09721v2.pdf","comment":"Published in the proceeding of 2024 IEEE International Conference on\n Acoustics, Speech and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.10475v1","updated":"2024-01-19T03:54:58Z","published":"2024-01-19T03:54:58Z","title":"CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short\n Video Search Scenarios","summary":" Vision-Language Models pre-trained on large-scale image-text datasets have\nshown superior performance in downstream tasks such as image retrieval. Most of\nthe images for pre-training are presented in the form of open domain\ncommon-sense visual elements. Differently, video covers in short video search\nscenarios are presented as user-originated contents that provide important\nvisual summaries of videos. In addition, a portion of the video covers come\nwith manually designed cover texts that provide semantic complements. In order\nto fill in the gaps in short video cover data, we establish the first\nlarge-scale cover-text benchmark for Chinese short video search scenarios.\nSpecifically, we release two large-scale datasets CBVS-5M/10M to provide short\nvideo covers, and the manual fine-labeling dataset CBVS-20K to provide real\nuser queries, which serves as an image-text benchmark test in the Chinese short\nvideo search field. To integrate the semantics of cover text in the case of\nmodality missing, we propose UniCLIP where cover texts play a guiding role\nduring training, however are not relied upon by inference. Extensive evaluation\non CBVS-20K demonstrates the excellent performance of our proposal. 
UniCLIP has\nbeen deployed to Tencent's online video search systems with hundreds of\nmillions of visits and achieved significant gains. The complete dataset, code\nand checkpoints will be available upon release.\n","authors":["Xiangshuo Qiao","Xianxin Li","Xiaozhe Qu","Jie Zhang","Yang Liu","Yu Luo","Cihang Jin","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2401.10475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10474v1","updated":"2024-01-19T03:50:19Z","published":"2024-01-19T03:50:19Z","title":"LDReg: Local Dimensionality Regularized Self-Supervised Learning","summary":" Representations learned via self-supervised learning (SSL) can be susceptible\nto dimensional collapse, where the learned representation subspace is of\nextremely low dimensionality and thus fails to represent the full data\ndistribution and modalities. Dimensional collapse also known as the\n\"underfilling\" phenomenon is one of the major causes of degraded performance on\ndownstream tasks. Previous work has investigated the dimensional collapse\nproblem of SSL at a global level. In this paper, we demonstrate that\nrepresentations can span over high dimensional space globally, but collapse\nlocally. To address this, we propose a method called $\\textit{local\ndimensionality regularization (LDReg)}$. Our formulation is based on the\nderivation of the Fisher-Rao metric to compare and optimize local distance\ndistributions at an asymptotically small radius for each data point. By\nincreasing the local intrinsic dimensionality, we demonstrate through a range\nof experiments that LDReg improves the representation quality of SSL. The\nresults also show that LDReg can regularize dimensionality at both local and\nglobal levels.\n","authors":["Hanxun Huang","Ricardo J. G. B. Campello","Sarah Monazam Erfani","Xingjun Ma","Michael E. Houle","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2401.10474v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2309.09466v2","updated":"2024-01-19T03:37:57Z","published":"2023-09-18T04:01:25Z","title":"Progressive Text-to-Image Diffusion with Soft Latent Direction","summary":" In spite of the rapidly evolving landscape of text-to-image generation, the\nsynthesis and manipulation of multiple entities while adhering to specific\nrelational constraints pose enduring challenges. This paper introduces an\ninnovative progressive synthesis and editing operation that systematically\nincorporates entities into the target image, ensuring their adherence to\nspatial and relational constraints at each sequential step. Our key insight\nstems from the observation that while a pre-trained text-to-image diffusion\nmodel adeptly handles one or two entities, it often falters when dealing with a\ngreater number. To address this limitation, we propose harnessing the\ncapabilities of a Large Language Model (LLM) to decompose intricate and\nprotracted text descriptions into coherent directives adhering to stringent\nformats. To facilitate the execution of directives involving distinct semantic\noperations-namely insertion, editing, and erasing-we formulate the Stimulus,\nResponse, and Fusion (SRF) framework. Within this framework, latent regions are\ngently stimulated in alignment with each operation, followed by the fusion of\nthe responsive latent components to achieve cohesive entity manipulation. 
Our\nproposed framework yields notable advancements in object synthesis,\nparticularly when confronted with intricate and lengthy textual inputs.\nConsequently, it establishes a new benchmark for text-to-image generation\ntasks, further elevating the field's performance standards.\n","authors":["YuTeng Ye","Jiale Cai","Hang Zhou","Guanwen Li","Youjia Zhang","Zikai Song","Chenxing Gao","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2309.09466v2.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2401.10090v2","updated":"2024-01-19T03:31:49Z","published":"2024-01-18T15:56:23Z","title":"Cross-Modality Perturbation Synergy Attack for Person Re-identification","summary":" In recent years, there has been significant research focusing on addressing\nsecurity concerns in single-modal person re-identification (ReID) systems that\nare based on RGB images. However, the safety of cross-modality scenarios, which\nare more commonly encountered in practical applications involving images\ncaptured by infrared cameras, has not received adequate attention. The main\nchallenge in cross-modality ReID lies in effectively dealing with visual\ndifferences between different modalities. For instance, infrared images are\ntypically grayscale, unlike visible images that contain color information.\nExisting attack methods have primarily focused on the characteristics of the\nvisible image modality, overlooking the features of other modalities and the\nvariations in data distribution among different modalities. This oversight can\npotentially undermine the effectiveness of these methods in image retrieval\nacross diverse modalities. This study represents the first exploration into the\nsecurity of cross-modality ReID models and proposes a universal perturbation\nattack specifically designed for cross-modality ReID. This attack optimizes\nperturbations by leveraging gradients from diverse modality data, thereby\ndisrupting the discriminator and reinforcing the differences between\nmodalities. We conducted experiments on two widely used cross-modality\ndatasets, namely RegDB and SYSU, which not only demonstrated the effectiveness\nof our method but also provided insights for future enhancements in the\nrobustness of cross-modality ReID systems.\n","authors":["Yunpeng Gong","Zhun Zhong","Zhiming Luo","Yansong Qu","Rongrong Ji","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.10090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10461v1","updated":"2024-01-19T03:01:07Z","published":"2024-01-19T03:01:07Z","title":"Learning to Robustly Reconstruct Low-light Dynamic Scenes from Spike\n Streams","summary":" As a neuromorphic sensor with high temporal resolution, spike camera can\ngenerate continuous binary spike streams to capture per-pixel light intensity.\nWe can use reconstruction methods to restore scene details in high-speed\nscenarios. However, due to limited information in spike streams, low-light\nscenes are difficult to effectively reconstruct. In this paper, we propose a\nbidirectional recurrent-based reconstruction framework, including a\nLight-Robust Representation (LR-Rep) and a fusion module, to better handle such\nextreme conditions. LR-Rep is designed to aggregate temporal information in\nspike streams, and a fusion module is utilized to extract temporal features.\nAdditionally, we have developed a reconstruction benchmark for high-speed\nlow-light scenes. Light sources in the scenes are carefully aligned to\nreal-world conditions. 
Experimental results demonstrate the superiority of our\nmethod, which also generalizes well to real spike streams. Related codes and\nproposed datasets will be released after publication.\n","authors":["Liwen Hu","Ziluo Ding","Mianzhi Liu","Lei Ma","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14197v2","updated":"2024-01-19T02:46:00Z","published":"2023-10-22T06:16:16Z","title":"Diffusion-based Data Augmentation for Nuclei Image Segmentation","summary":" Nuclei segmentation is a fundamental but challenging task in the quantitative\nanalysis of histopathology images. Although fully-supervised deep\nlearning-based methods have made significant progress, a large number of\nlabeled images are required to achieve great segmentation performance.\nConsidering that manually labeling all nuclei instances for a dataset is\ninefficient, obtaining a large-scale human-annotated dataset is time-consuming\nand labor-intensive. Therefore, augmenting a dataset with only a few labeled\nimages to improve the segmentation performance is of significant research and\napplication value. In this paper, we introduce the first diffusion-based\naugmentation method for nuclei segmentation. The idea is to synthesize a large\nnumber of labeled images to facilitate training the segmentation model. To\nachieve this, we propose a two-step strategy. In the first step, we train an\nunconditional diffusion model to synthesize the Nuclei Structure that is\ndefined as the representation of pixel-level semantic and distance transform.\nEach synthetic nuclei structure will serve as a constraint on histopathology\nimage synthesis and is further post-processed to be an instance map. In the\nsecond step, we train a conditioned diffusion model to synthesize\nhistopathology images based on nuclei structures. The synthetic histopathology\nimages paired with synthetic instance maps will be added to the real dataset\nfor training the segmentation model. The experimental results show that by\naugmenting 10% labeled real dataset with synthetic samples, one can achieve\ncomparable segmentation results with the fully-supervised baseline. The code is\nreleased in: https://github.com/lhaof/Nudiff\n","authors":["Xinyi Yu","Guanbin Li","Wei Lou","Siqi Liu","Xiang Wan","Yan Chen","Haofeng Li"],"pdf_url":"https://arxiv.org/pdf/2310.14197v2.pdf","comment":"MICCAI 2023, released code: https://github.com/lhaof/Nudiff"},{"id":"http://arxiv.org/abs/2311.15497v3","updated":"2024-01-19T02:45:44Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed improvements of up to 1.6% in test data, while maintaining the same\ninference time, and a substantial 1.0% points performance gain in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06551v4","updated":"2024-01-19T02:42:20Z","published":"2022-08-13T02:50:35Z","title":"Exploiting Multiple Sequence Lengths in Fast End to End Training for\n Image Captioning","summary":" We introduce a method called the Expansion mechanism that processes the input\nunconstrained by the number of elements in the sequence. By doing so, the model\ncan learn more effectively compared to traditional attention-based approaches.\nTo support this claim, we design a novel architecture ExpansionNet v2 that\nachieved strong results on the MS COCO 2014 Image Captioning challenge and the\nState of the Art in its respective category, with a score of 143.7 CIDErD in\nthe offline test split, 140.8 CIDErD in the online evaluation server and 72.9\nAllCIDEr on the nocaps validation set. Additionally, we introduce an End to End\ntraining algorithm up to 2.8 times faster than established alternatives. Source\ncode available at: https://github.com/jchenghu/ExpansionNet_v2\n","authors":["Jia Cheng Hu","Roberto Cavicchioli","Alessandro Capotondi"],"pdf_url":"https://arxiv.org/pdf/2208.06551v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10110v2","updated":"2024-01-19T02:31:02Z","published":"2024-01-18T16:27:09Z","title":"VIPTR: A Vision Permutable Extractor for Fast and Efficient Scene Text\n Recognition","summary":" Scene Text Recognition (STR) is a challenging task that involves recognizing\ntext within images of natural scenes. Although current state-of-the-art models\nfor STR exhibit high performance, they typically suffer from low inference\nefficiency due to their reliance on hybrid architectures comprised of visual\nencoders and sequence decoders. In this work, we propose the VIsion Permutable\nextractor for fast and efficient scene Text Recognition (VIPTR), which achieves\nan impressive balance between high performance and rapid inference speeds in\nthe domain of STR. Specifically, VIPTR leverages a visual-semantic extractor\nwith a pyramid structure, characterized by multiple self-attention layers,\nwhile eschewing the traditional sequence decoder. This design choice results in\na lightweight and efficient model capable of handling inputs of varying sizes.\nExtensive experimental results on various standard datasets for both Chinese\nand English scene text recognition validate the superiority of VIPTR. Notably,\nthe VIPTR-T (Tiny) variant delivers highly competitive accuracy on par with\nother lightweight models and achieves SOTA inference speeds. Meanwhile, the\nVIPTR-L (Large) variant attains greater recognition accuracy, while maintaining\na low parameter count and favorable inference speed. Our proposed method\nprovides a compelling solution for the STR challenge, which blends high\naccuracy with efficiency and greatly benefits real-world applications requiring\nfast and reliable text recognition. 
The code is publicly available at\nhttps://github.com/cxfyxl/VIPTR.\n","authors":["Xianfu Cheng","Weixiao Zhou","Xiang Li","Xiaoming Chen","Jian Yang","Tongliang Li","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2401.10110v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.00159 by other authors"},{"id":"http://arxiv.org/abs/2312.06946v2","updated":"2024-01-19T02:08:07Z","published":"2023-12-12T02:55:14Z","title":"WaterHE-NeRF: Water-ray Tracing Neural Radiance Fields for Underwater\n Scene Reconstruction","summary":" Neural Radiance Field (NeRF) technology demonstrates immense potential in\nnovel viewpoint synthesis tasks, due to its physics-based volumetric rendering\nprocess, which is particularly promising in underwater scenes. Addressing the\nlimitations of existing underwater NeRF methods in handling light attenuation\ncaused by the water medium and the lack of real Ground Truth (GT) supervision,\nthis study proposes WaterHE-NeRF. We develop a new water-ray tracing field by\nRetinex theory that precisely encodes color, density, and illuminance\nattenuation in three-dimensional space. WaterHE-NeRF, through its illuminance\nattenuation mechanism, generates both degraded and clear multi-view images and\noptimizes image restoration by combining reconstruction loss with Wasserstein\ndistance. Additionally, the use of histogram equalization (HE) as pseudo-GT\nenhances the network's accuracy in preserving original details and color\ndistribution. Extensive experiments on real underwater datasets and synthetic\ndatasets validate the effectiveness of WaterHE-NeRF. Our code will be made\npublicly available.\n","authors":["Jingchun Zhou","Tianyu Liang","Dehuan Zhang","Zongxin He"],"pdf_url":"https://arxiv.org/pdf/2312.06946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18999v3","updated":"2024-01-19T01:57:15Z","published":"2023-10-29T12:55:53Z","title":"DynPoint: Dynamic Neural Point For View Synthesis","summary":" The introduction of neural radiance fields has greatly improved the\neffectiveness of view synthesis for monocular videos. However, existing\nalgorithms face difficulties when dealing with uncontrolled or lengthy\nscenarios, and require extensive training time specific to each new scenario.\nTo tackle these limitations, we propose DynPoint, an algorithm designed to\nfacilitate the rapid synthesis of novel views for unconstrained monocular\nvideos. Rather than encoding the entirety of the scenario information into a\nlatent representation, DynPoint concentrates on predicting the explicit 3D\ncorrespondence between neighboring frames to realize information aggregation.\nSpecifically, this correspondence prediction is achieved through the estimation\nof consistent depth and scene flow information across frames. Subsequently, the\nacquired correspondence is utilized to aggregate information from multiple\nreference frames to a target frame, by constructing hierarchical neural point\nclouds. The resulting framework enables swift and accurate view synthesis for\ndesired views of target frames. The experimental results obtained demonstrate\nthe considerable acceleration of training time achieved - typically an order of\nmagnitude - by our proposed method while yielding comparable outcomes compared\nto prior approaches. 
Furthermore, our method exhibits strong robustness in\nhandling long-duration videos without learning a canonical representation of\nvideo content.\n","authors":["Kaichen Zhou","Jia-Xing Zhong","Sangyun Shin","Kai Lu","Yiyuan Yang","Andrew Markham","Niki Trigoni"],"pdf_url":"https://arxiv.org/pdf/2310.18999v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10766v2","updated":"2024-01-19T01:51:45Z","published":"2023-01-25T18:59:15Z","title":"On the Adversarial Robustness of Camera-based 3D Object Detection","summary":" In recent years, camera-based 3D object detection has gained widespread\nattention for its ability to achieve high performance with low computational\ncost. However, the robustness of these methods to adversarial attacks has not\nbeen thoroughly examined, especially when considering their deployment in\nsafety-critical domains like autonomous driving. In this study, we conduct the\nfirst comprehensive investigation of the robustness of leading camera-based 3D\nobject detection approaches under various adversarial conditions. We\nsystematically analyze the resilience of these models under two attack\nsettings: white-box and black-box; focusing on two primary objectives:\nclassification and localization. Additionally, we delve into two types of\nadversarial attack techniques: pixel-based and patch-based. Our experiments\nyield four interesting findings: (a) bird's-eye-view-based representations\nexhibit stronger robustness against localization attacks; (b)\ndepth-estimation-free approaches have the potential to show stronger\nrobustness; (c) accurate depth estimation effectively improves robustness for\ndepth-estimation-based methods; (d) incorporating multi-frame benign inputs can\neffectively mitigate adversarial attacks. We hope our findings can steer the\ndevelopment of future camera-based object detection models with enhanced\nadversarial robustness.\n","authors":["Shaoyuan Xie","Zichao Li","Zeyu Wang","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2301.10766v2.pdf","comment":"Transactions on Machine Learning Research, 2024. ISSN 2835-8856"},{"id":"http://arxiv.org/abs/2312.06955v2","updated":"2024-01-19T01:47:22Z","published":"2023-12-12T03:26:04Z","title":"IA2U: A Transfer Plugin with Multi-Prior for In-Air Model to Underwater","summary":" In underwater environments, variations in suspended particle concentration\nand turbidity cause severe image degradation, posing significant challenges to\nimage enhancement (IE) and object detection (OD) tasks. Currently, in-air image\nenhancement and detection methods have made notable progress, but their\napplication in underwater conditions is limited due to the complexity and\nvariability of these environments. Fine-tuning in-air models saves high\noverhead and has more optional reference work than building an underwater model\nfrom scratch. To address these issues, we design a transfer plugin with\nmultiple priors for converting in-air models to underwater applications, named\nIA2U. IA2U enables efficient application in underwater scenarios, thereby\nimproving performance in Underwater IE and OD. IA2U integrates three types of\nunderwater priors: the water type prior that characterizes the degree of image\ndegradation, such as color and visibility; the degradation prior, focusing on\ndifferences in details and textures; and the sample prior, considering the\nenvironmental conditions at the time of capture and the characteristics of the\nphotographed object. 
Utilizing a Transformer-like structure, IA2U employs these\npriors as query conditions and a joint task loss function to achieve\nhierarchical enhancement of task-level underwater image features, therefore\nconsidering the requirements of two different tasks, IE and OD. Experimental\nresults show that IA2U combined with an in-air model can achieve superior\nperformance in underwater image enhancement and object detection tasks. The\ncode will be made publicly available.\n","authors":["Jingchun Zhou","Qilin Gai","Kin-man Lam","Xianping Fu"],"pdf_url":"https://arxiv.org/pdf/2312.06955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06999v2","updated":"2024-01-19T01:46:49Z","published":"2023-12-12T06:07:21Z","title":"DGNet: Dynamic Gradient-guided Network with Noise Suppression for\n Underwater Image Enhancement","summary":" Underwater image enhancement (UIE) is a challenging task due to the complex\ndegradation caused by underwater environments. To solve this issue, previous\nmethods often idealize the degradation process, and neglect the impact of\nmedium noise and object motion on the distribution of image features, limiting\nthe generalization and adaptability of the model. Previous methods use the\nreference gradient that is constructed from original images and synthetic\nground-truth images. This may cause the network performance to be influenced by\nsome low-quality training data. Our approach utilizes predicted images to\ndynamically update pseudo-labels, adding a dynamic gradient to optimize the\nnetwork's gradient space. This process improves image quality and avoids local\noptima. Moreover, we propose a Feature Restoration and Reconstruction module\n(FRR) based on a Channel Combination Inference (CCI) strategy and a Frequency\nDomain Smoothing module (FRS). These modules decouple other degradation\nfeatures while reducing the impact of various types of noise on network\nperformance. Experiments on multiple public datasets demonstrate the\nsuperiority of our method over existing state-of-the-art approaches, especially\nin achieving performance milestones: PSNR of 25.6dB and SSIM of 0.93 on the\nUIEB dataset. Its efficiency in terms of parameter size and inference time\nfurther attests to its broad practicality. The code will be made publicly\navailable.\n","authors":["Jingchun Zhou","Zongxin He","Dehuan Zhang","Kin-man Lam","Xianping Fu","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.06999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10442v1","updated":"2024-01-19T01:11:44Z","published":"2024-01-19T01:11:44Z","title":"Path Choice Matters for Clear Attribution in Path Methods","summary":" Rigorousness and clarity are both essential for interpretations of DNNs to\nengender human trust. Path methods are commonly employed to generate rigorous\nattributions that satisfy three axioms. However, the meaning of attributions\nremains ambiguous due to distinct path choices. To address the ambiguity, we\nintroduce \\textbf{Concentration Principle}, which centrally allocates high\nattributions to indispensable features, thereby endowing aesthetic and\nsparsity. We then present \\textbf{SAMP}, a model-agnostic interpreter, which\nefficiently searches the near-optimal path from a pre-defined set of\nmanipulation paths. Moreover, we propose the infinitesimal constraint (IC) and\nmomentum strategy (MS) to improve the rigorousness and optimality.\nVisualizations show that SAMP can precisely reveal DNNs by pinpointing salient\nimage pixels. 
We also perform quantitative experiments and observe that our\nmethod significantly outperforms the counterparts. Code:\nhttps://github.com/zbr17/SAMP.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.10442v1.pdf","comment":"ICLR 2024 accepted"},{"id":"http://arxiv.org/abs/2304.00746v3","updated":"2024-01-19T00:42:13Z","published":"2023-04-03T06:40:52Z","title":"OTS: A One-shot Learning Approach for Text Spotting in Historical\n Manuscripts","summary":" In the field of historical manuscript research, scholars frequently encounter\nnovel symbols in ancient texts, investing considerable effort in their\nidentification and documentation. Although some object detection methods have\nachieved impressive performance, they primarily excel at detecting categories\nincluded in training datasets, often failing to recognize novel symbols without\nretraining. To overcome this limitation, we propose a novel One-shot\nlearning-based Text Spotting (OTS) approach that accurately and reliably spots\nnovel characters with just one annotated support sample. Drawing inspiration\nfrom cognitive research, we introduce a spatial alignment module that finds,\nfocuses on, and learns the most discriminative spatial regions in the query\nimage based on one support image. Especially, since the low-resource spotting\ntask often faces the problem of example imbalance, we propose a novel loss\nfunction called torus loss which can make the embedding space of distance\nmetric more discriminative. Our approach is highly efficient and requires only\na few training samples while exhibiting the remarkable ability to handle novel\ncharacters and symbols. To enhance dataset diversity, a new manuscript dataset\nthat contains the ancient Dongba hieroglyphics (DBH) is created, a script\nassociated with China and developed by the ancestors of the Naxi minority. We\nconduct experiments on publicly available DBH, EGY, VML-HD, TKH, and NC\ndatasets. The experimental results demonstrate that OTS outperforms the\nstate-of-the-art methods in one-shot text spotting. Overall, our proposed\nmethod offers promising applications in text spotting in historical\nmanuscripts.\n","authors":["Wenbo Hu","Hongjian Zhan","Cong Liu","Bing Yin","Yue Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00110v3","updated":"2024-01-19T00:35:35Z","published":"2023-12-30T01:24:25Z","title":"Diffusion Model with Perceptual Loss","summary":" Diffusion models trained with mean squared error loss tend to generate\nunrealistic samples. Current state-of-the-art models rely on classifier-free\nguidance to improve sample quality, yet its surprising effectiveness is not\nfully understood. In this paper, we show that the effectiveness of\nclassifier-free guidance partly originates from it being a form of implicit\nperceptual guidance. As a result, we can directly incorporate perceptual loss\nin diffusion training to improve sample quality. Since the score matching\nobjective used in diffusion training strongly resembles the denoising\nautoencoder objective used in unsupervised training of perceptual networks, the\ndiffusion model itself is a perceptual network and can be used to generate\nmeaningful perceptual loss. We propose a novel self-perceptual objective that\nresults in diffusion models capable of generating more realistic samples. 
For\nconditional generation, our method only improves sample quality without\nentanglement with the conditional input and therefore does not sacrifice sample\ndiversity. Our method can also improve sample quality for unconditional\ngeneration, which was not possible with classifier-free guidance before.\n","authors":["Shanchuan Lin","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00110v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2312.09631v2","updated":"2024-01-19T17:07:40Z","published":"2023-12-15T09:21:11Z","title":"Context-Driven Interactive Query Simulations Based on Generative Large\n Language Models","summary":" Simulating user interactions enables a more user-oriented evaluation of\ninformation retrieval (IR) systems. While user simulations are cost-efficient\nand reproducible, many approaches often lack fidelity regarding real user\nbehavior. Most notably, current user models neglect the user's context, which\nis the primary driver of perceived relevance and the interactions with the\nsearch results. To this end, this work introduces the simulation of\ncontext-driven query reformulations. The proposed query generation methods\nbuild upon recent Large Language Model (LLM) approaches and consider the user's\ncontext throughout the simulation of a search session. Compared to simple\ncontext-free query generation approaches, these methods show better\neffectiveness and allow the simulation of more efficient IR sessions.\nSimilarly, our evaluations consider more interaction context than current\nsession-based measures and reveal interesting complementary insights in\naddition to the established evaluation protocols. 
We conclude with directions\nfor future work and provide an entirely open experimental setup.\n","authors":["Björn Engelmann","Timo Breuer","Jana Isabelle Friese","Philipp Schaer","Norbert Fuhr"],"pdf_url":"https://arxiv.org/pdf/2312.09631v2.pdf","comment":"Accepted at ECIR 2024 (Full Paper)"},{"id":"http://arxiv.org/abs/2308.07107v3","updated":"2024-01-19T16:01:28Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v3.pdf","comment":"updated to version 2"},{"id":"http://arxiv.org/abs/2401.10733v1","updated":"2024-01-19T14:50:22Z","published":"2024-01-19T14:50:22Z","title":"Dynamic Q&A of Clinical Documents with Large Language Models","summary":" Electronic health records (EHRs) house crucial patient data in clinical\nnotes. As these notes grow in volume and complexity, manual extraction becomes\nchallenging. This work introduces a natural language interface using large\nlanguage models (LLMs) for dynamic question-answering on clinical notes. Our\nchatbot, powered by Langchain and transformer-based LLMs, allows users to query\nin natural language, receiving relevant answers from clinical notes.\nExperiments, utilizing various embedding models and advanced LLMs, show Wizard\nVicuna's superior accuracy, albeit with high compute demands. Model\noptimization, including weight quantization, improves latency by approximately\n48 times. Promising results indicate potential, yet challenges such as model\nhallucinations and limited diverse medical case evaluations remain. 
Addressing\nthese gaps is crucial for unlocking the value in clinical notes and advancing\nAI-driven clinical decision-making.\n","authors":["Ran Elgedawy","Sudarshan Srinivasan","Ioana Danciu"],"pdf_url":"https://arxiv.org/pdf/2401.10733v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.10690v1","updated":"2024-01-19T13:41:08Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10634v1","updated":"2024-01-19T11:22:04Z","published":"2024-01-19T11:22:04Z","title":"Automatic Construction of Multi-faceted User Profiles using Text\n Clustering and its Application to Expert Recommendation and Filtering\n Problems","summary":" In the information age we are living in today, not only are we interested in\naccessing multimedia objects such as documents, videos, etc. but also in\nsearching for professional experts, people or celebrities, possibly for\nprofessional needs or just for fun. Information access systems need to be able\nto extract and exploit various sources of information (usually in text format)\nabout such individuals, and to represent them in a suitable way usually in the\nform of a profile. In this article, we tackle the problems of profile-based\nexpert recommendation and document filtering from a machine learning\nperspective by clustering expert textual sources to build profiles and capture\nthe different hidden topics in which the experts are interested. The experts\nwill then be represented by means of multi-faceted profiles. Our experiments\nshow that this is a valid technique to improve the performance of expert\nfinding and document filtering.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. 
Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10617v1","updated":"2024-01-19T10:49:31Z","published":"2024-01-19T10:49:31Z","title":"LDA-based Term Profiles for Expert Finding in a Political Setting","summary":" A common task in many political institutions (i.e. Parliament) is to find\npoliticians who are experts in a particular field. In order to tackle this\nproblem, the first step is to obtain politician profiles which include their\ninterests, and these can be automatically learned from their speeches. As a\npolitician may have various areas of expertise, one alternative is to use a set\nof subprofiles, each of which covers a different subject. In this study, we\npropose a novel approach for this task by using latent Dirichlet allocation\n(LDA) to determine the main underlying topics of each political speech, and to\ndistribute the related terms among the different topic-based subprofiles. With\nthis objective, we propose the use of fifteen distance and similarity measures\nto automatically determine the optimal number of topics discussed in a\ndocument, and to demonstrate that every measure converges into five strategies:\nEuclidean, Dice, Sorensen, Cosine and Overlap. Our experimental results showed\nthat the scores of the different accuracy metrics of the proposed strategies\ntended to be higher than those of the baselines for expert recommendation\ntasks, and that the use of an appropriate number of topics has proved relevant.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10611v1","updated":"2024-01-19T10:42:29Z","published":"2024-01-19T10:42:29Z","title":"Publication venue recommendation using profiles based on clustering","summary":" In this paper we study the venue recommendation problem in order to help\nresearchers to identify a journal or conference to submit a given paper. A\ncommon approach to tackle this problem is to build profiles defining the scope\nof each venue. Then, these profiles are compared against the target paper. In\nour approach we will study how clustering techniques can be used to construct\ntopic-based profiles and use an Information Retrieval based approach to obtain\nthe final recommendations. Additionally, we will explore how the use of\nauthorship, representing a complementary piece of information, helps to improve\nthe recommendations.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10607v1","updated":"2024-01-19T10:32:28Z","published":"2024-01-19T10:32:28Z","title":"Use of topical and temporal profiles and their hybridisation for\n content-based recommendation","summary":" In the context of content-based recommender systems, the aim of this paper is\nto determine how better profiles can be built and how these affect the\nrecommendation process based on the incorporation of temporality, i.e. the\ninclusion of time in the recommendation process, and topicality, i.e. the\nrepresentation of texts associated with users and items using topics and their\ncombination. The main contribution of the paper is to present two different\nways of hybridising these two dimensions and to evaluate and compare them with\nother alternatives.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. 
Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10545v1","updated":"2024-01-19T08:09:20Z","published":"2024-01-19T08:09:20Z","title":"Understanding Biases in ChatGPT-based Recommender Systems: Provider\n Fairness, Temporal Stability, and Recency","summary":" This study explores the nuanced capabilities and inherent biases of\nRecommender Systems using Large Language Models (RecLLMs), with a focus on\nChatGPT-based systems. It studies into the contrasting behaviors of generative\nmodels and traditional collaborative filtering models in movie recommendations.\nThe research primarily investigates prompt design strategies and their impact\non various aspects of recommendation quality, including accuracy, provider\nfairness, diversity, stability, genre dominance, and temporal freshness\n(recency).\n Our experimental analysis reveals that the introduction of specific 'system\nroles' and 'prompt strategies' in RecLLMs significantly influences their\nperformance. For instance, role-based prompts enhance fairness and diversity in\nrecommendations, mitigating popularity bias. We find that while GPT-based\nmodels do not always match the performance of CF baselines, they exhibit a\nunique tendency to recommend newer and more diverse movie genres. Notably,\nGPT-based models tend to recommend more recent films, particularly those\nreleased post-2000, and show a preference for genres like \\sq{Drama} and\nComedy, and Romance (compared to CF Action, Adventure) presumably due to the\nRecLLMs' training on varied data sets, which allows them to capture recent\ntrends and discussions more effectively than CF models. Interestingly, our\nresults demonstrate that the 'Simple' and 'Chain of Thought (COT)' paradigms\nyield the highest accuracy. These findings imply the potential of combining\nthese strategies with scenarios that favor more recent content, thereby\noffering a more balanced and up-to-date recommendation experience. This study\ncontributes significantly to the understanding of emerging RecLLMs,\nparticularly in the context of harms and biases within these systems.\n","authors":["Yashar Deldjoo"],"pdf_url":"https://arxiv.org/pdf/2401.10545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04971v2","updated":"2024-01-19T07:52:57Z","published":"2024-01-10T07:31:26Z","title":"A Survey on Cross-Domain Sequential Recommendation","summary":" Cross-domain sequential recommendation (CDSR) shifts the modeling of user\npreferences from flat to stereoscopic by integrating and learning interaction\ninformation from multiple domains at different granularities (ranging from\ninter-sequence to intra-sequence and from single-domain to cross-domain). In\nthis survey, we first define the CDSR problem using a four-dimensional tensor\nand then analyze its multi-type input representations under multidirectional\ndimensionality reductions. Following that, we provide a systematic overview\nfrom both macro and micro views. From a macro view, we abstract the multi-level\nfusion structures of various models across domains and discuss their bridges\nfor fusion. From a micro view, focusing on the existing models, we specifically\ndiscuss the basic technologies and then explain the auxiliary learning\ntechnologies. 
Finally, we exhibit the available public datasets and the\nrepresentative experimental results as well as provide some insights into\nfuture directions for research in CDSR.\n","authors":["Shu Chen","Zitao Xu","Weike Pan","Qiang Yang","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2401.04971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09885v2","updated":"2024-01-19T07:23:04Z","published":"2024-01-18T10:56:27Z","title":"Source Code Clone Detection Using Unsupervised Similarity Measures","summary":" Assessing similarity in source code has gained significant attention in\nrecent years due to its importance in software engineering tasks such as clone\ndetection and code search and recommendation. This work presents a comparative\nanalysis of unsupervised similarity measures for identifying source code clone\ndetection. The goal is to overview the current state-of-the-art techniques,\ntheir strengths, and weaknesses. To do that, we compile the existing\nunsupervised strategies and evaluate their performance on a benchmark dataset\nto guide software engineers in selecting appropriate methods for their specific\nuse cases. The source code of this study is available at\nhttps://github.com/jorge-martinez-gil/codesim\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2401.09885v2.pdf","comment":"Accepted for publication as Full Paper in the Software Quality Days\n 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2401.00368v2","updated":"2024-01-19T05:16:20Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":" In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across nearly 100 languages. We then\nfine-tune open-source decoder-only LLMs on the synthetic data using standard\ncontrastive loss. Experiments demonstrate that our method achieves strong\nperformance on highly competitive text embedding benchmarks without using any\nlabeled data. Furthermore, when fine-tuned with a mixture of synthetic and\nlabeled data, our model sets new state-of-the-art results on the BEIR and MTEB\nbenchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v2.pdf","comment":"20 pages, 15 tables"},{"id":"http://arxiv.org/abs/2401.10487v1","updated":"2024-01-19T04:24:07Z","published":"2024-01-19T04:24:07Z","title":"Generative Dense Retrieval: Memory Can Be a Burden","summary":" Generative Retrieval (GR), autoregressively decoding relevant document\nidentifiers given a query, has been shown to perform well under the setting of\nsmall-scale corpora. By memorizing the document corpus with model parameters,\nGR implicitly achieves deep interaction between query and document. 
However,\nsuch a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for\nfine-grained features of documents; (2) Memory confusion gets worse as the\ncorpus size increases; (3) Huge memory update costs for new documents. To\nalleviate these problems, we propose the Generative Dense Retrieval (GDR)\nparadigm. Specifically, GDR first uses the limited memory volume to achieve\ninter-cluster matching from query to relevant document clusters.\nMemorizing-free matching mechanism from Dense Retrieval (DR) is then introduced\nto conduct fine-grained intra-cluster matching from clusters to relevant\ndocuments. The coarse-to-fine process maximizes the advantages of GR's deep\ninteraction and DR's scalability. Besides, we design a cluster identifier\nconstructing strategy to facilitate corpus memory and a cluster-adaptive\nnegative sampling strategy to enhance the intra-cluster mapping ability.\nEmpirical results show that GDR obtains an average of 3.0 R@100 improvement on\nNQ dataset under multiple settings and has better scalability.\n","authors":["Peiwen Yuan","Xinglin Wang","Shaoxiong Feng","Boyuan Pan","Yiwei Li","Heda Wang","Xupeng Miao","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10487v1.pdf","comment":"EACL 2024 main"},{"id":"http://arxiv.org/abs/2401.10484v1","updated":"2024-01-19T04:17:50Z","published":"2024-01-19T04:17:50Z","title":"Enhancing Scalability in Recommender Systems through Lottery Ticket\n Hypothesis and Knowledge Distillation-based Neural Network Pruning","summary":" This study introduces an innovative approach aimed at the efficient pruning\nof neural networks, with a particular focus on their deployment on edge\ndevices. Our method involves the integration of the Lottery Ticket Hypothesis\n(LTH) with the Knowledge Distillation (KD) framework, resulting in the\nformulation of three distinct pruning models. These models have been developed\nto address scalability issue in recommender systems, whereby the complexities\nof deep learning models have hindered their practical deployment. With\njudicious application of the pruning techniques, we effectively curtail the\npower consumption and model dimensions without compromising on accuracy.\nEmpirical evaluation has been performed using two real world datasets from\ndiverse domains against two baselines. Gratifyingly, our approaches yielded a\nGPU computation-power reduction of up to 66.67%. Notably, our study contributes\nto the field of recommendation system by pioneering the application of LTH and\nKD.\n","authors":["Rajaram R","Manoj Bharadhwaj","Vasan VS","Nargis Pervin"],"pdf_url":"https://arxiv.org/pdf/2401.10484v1.pdf","comment":"Accepted in WITS 2023 as a workshop paper"},{"id":"http://arxiv.org/abs/2401.10963v1","updated":"2024-01-19T11:50:26Z","published":"2024-01-19T11:50:26Z","title":"On the selection of the correct number of terms for profile\n construction: theoretical and empirical analysis","summary":" In this paper, we examine the problem of building a user profile from a set\nof documents. This profile will consist of a subset of the most representative\nterms in the documents that best represent user preferences or interests.\nInspired by the discrete concentration theory we have conducted an axiomatic\nstudy of seven properties that a selection function should fulfill: the minimum\nand maximum uncertainty principle, invariant to adding zeros, invariant to\nscale transformations, principle of nominal increase, transfer principle and\nthe richest get richer inequality. 
We also present a novel selection function\nbased on the use of similarity metrics, and more specifically the cosine\nmeasure which is commonly used in information retrieval, and demonstrate that\nthis verifies six of the properties in addition to a weaker variant of the\ntransfer principle, thereby representing a good selection approach. The\ntheoretical study was complemented with an empirical study to compare the\nperformance of different selection criteria (weight- and unweight-based) using\nreal data in a parliamentary setting. In this study, we analyze the performance\nof the different functions focusing on the two main factors affecting the\nselection process: profile size (number of terms) and weight distribution.\nThese profiles are then used in a document filtering task to show that our\nsimilarity-based approach performs well in terms not only of recommendation\naccuracy but also efficiency (we obtain smaller profiles and consequently\nfaster recommendations).\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10961v1","updated":"2024-01-19T11:14:37Z","published":"2024-01-19T11:14:37Z","title":"Positive unlabeled learning for building recommender systems in a\n parliamentary setting","summary":" Our goal is to learn about the political interests and preferences of the\nMembers of Parliament by mining their parliamentary activity, in order to\ndevelop a recommendation/filtering system that, given a stream of documents to\nbe distributed among them, is able to decide which documents should receive\neach Member of Parliament. We propose to use positive unlabeled learning to\ntackle this problem, because we only have information about relevant documents\n(the own interventions of each Member of Parliament in the debates) but not\nabout irrelevant documents, so that we cannot use standard binary classifiers\ntrained with positive and negative examples. We have also developed a new\nalgorithm of this type, which compares favourably with: a) the baseline\napproach assuming that all the interventions of other Members of Parliament are\nirrelevant, b) another well-known positive unlabeled learning method and c) an\napproach based on information retrieval methods that matches documents and\nlegislators' representations. The experiments have been carried out with data\nfrom the regional Andalusian Parliament at Spain.\n","authors":["Luis M. de Camposa","Juan M. Fernández-Luna","Juan F. Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10956v1","updated":"2024-01-19T05:54:35Z","published":"2024-01-19T05:54:35Z","title":"AI Revolution on Chat Bot: Evidence from a Randomized Controlled\n Experiment","summary":" In recent years, generative AI has undergone major advancements,\ndemonstrating significant promise in augmenting human productivity. Notably,\nlarge language models (LLM), with ChatGPT-4 as an example, have drawn\nconsiderable attention. Numerous articles have examined the impact of LLM-based\ntools on human productivity in lab settings and designed tasks or in\nobservational studies. Despite recent advances, field experiments applying\nLLM-based tools in realistic settings are limited. 
This paper presents the\nfindings of a field randomized controlled trial assessing the effectiveness of\nLLM-based tools in providing unmonitored support services for information\nretrieval.\n","authors":["Sida Peng","Wojciech Swiatek","Allen Gao","Paul Cullivan","Haoge Chang"],"pdf_url":"https://arxiv.org/pdf/2401.10956v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.10886v1","updated":"2024-01-19T18:57:46Z","published":"2024-01-19T18:57:46Z","title":"SCENES: Subpixel Correspondence Estimation With Epipolar Supervision","summary":" Extracting point correspondences from two or more views of a scene is a\nfundamental computer vision problem with particular importance for relative\ncamera pose estimation and structure-from-motion. Existing local feature\nmatching approaches, trained with correspondence supervision on large-scale\ndatasets, obtain highly-accurate matches on the test sets. However, they do not\ngeneralise well to new datasets with different characteristics to those they\nwere trained on, unlike classic feature extractors. Instead, they require\nfinetuning, which assumes that ground-truth correspondences or ground-truth\ncamera poses and 3D structure are available. We relax this assumption by\nremoving the requirement of 3D structure, e.g., depth maps or point clouds, and\nonly require camera pose information, which can be obtained from odometry. We\ndo so by replacing correspondence losses with epipolar losses, which encourage\nputative matches to lie on the associated epipolar line. While weaker than\ncorrespondence supervision, we observe that this cue is sufficient for\nfinetuning existing models on new data. We then further relax the assumption of\nknown camera poses by using pose estimates in a novel bootstrapping approach.\nWe evaluate on highly challenging datasets, including an indoor drone dataset\nand an outdoor smartphone camera dataset, and obtain state-of-the-art results\nwithout strong supervision.\n","authors":["Dominik A. Kloepfer","João F. Henriques","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2401.10886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10874v1","updated":"2024-01-19T18:33:52Z","published":"2024-01-19T18:33:52Z","title":"Applications of flow models to the generation of correlated lattice QCD\n ensembles","summary":" Machine-learned normalizing flows can be used in the context of lattice\nquantum field theory to generate statistically correlated ensembles of lattice\ngauge fields at different action parameters. This work demonstrates how these\ncorrelations can be exploited for variance reduction in the computation of\nobservables. Three different proof-of-concept applications are demonstrated\nusing a novel residual flow architecture: continuum limits of gauge theories,\nthe mass dependence of QCD observables, and hadronic matrix elements based on\nthe Feynman-Hellmann approach. In all three cases, it is shown that statistical\nuncertainties are significantly reduced when machine-learned flows are\nincorporated as compared with the same calculations performed with uncorrelated\nensembles or direct reweighting.\n","authors":["Ryan Abbott","Aleksandar Botev","Denis Boyda","Daniel C. Hackett","Gurtej Kanwar","Sébastien Racanière","Danilo J. Rezende","Fernando Romero-López","Phiala E. Shanahan","Julian M. 
Urban"],"pdf_url":"https://arxiv.org/pdf/2401.10874v1.pdf","comment":"11 pages, 2 tables, 5 figures"},{"id":"http://arxiv.org/abs/2306.00119v2","updated":"2024-01-19T18:30:27Z","published":"2023-05-31T18:48:16Z","title":"Optimal Sets and Solution Paths of ReLU Networks","summary":" We develop an analytical framework to characterize the set of optimal ReLU\nneural networks by reformulating the non-convex training problem as a convex\nprogram. We show that the global optima of the convex parameterization are\ngiven by a polyhedral set and then extend this characterization to the optimal\nset of the non-convex training objective. Since all stationary points of the\nReLU training problem can be represented as optima of sub-sampled convex\nprograms, our work provides a general expression for all critical points of the\nnon-convex objective. We then leverage our results to provide an optimal\npruning algorithm for computing minimal networks, establish conditions for the\nregularization path of ReLU networks to be continuous, and develop sensitivity\nresults for minimal ReLU networks.\n","authors":["Aaron Mishkin","Mert Pilanci"],"pdf_url":"https://arxiv.org/pdf/2306.00119v2.pdf","comment":"Minor updates and corrections to clarify the role of merge/split\n symmetries in formation of ReLU optimal set and add missing sufficient\n conditions for all minimal models to have the same cardinality"},{"id":"http://arxiv.org/abs/2401.10862v1","updated":"2024-01-19T18:05:34Z","published":"2024-01-19T18:05:34Z","title":"Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs\n Without Fine-Tuning","summary":" Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type\nof attack that can coax these models into generating harmful and illegal\ncontent. In this paper, we show that pruning up to 20% of LLM parameters\nmarkedly increases their resistance to such attacks without additional training\nand without sacrificing their performance in standard benchmarks. Intriguingly,\nwe discovered that the enhanced safety observed post-pruning correlates to the\ninitial safety training level of the model, hinting that the effect of pruning\ncould be more general and may hold for other LLM behaviors beyond safety.\nAdditionally, we introduce a curated dataset of 225 harmful tasks across five\ncategories, inserted into ten different Jailbreaking prompts, showing that\npruning aids LLMs in concentrating attention on task-relevant tokens in\njailbreaking prompts. Lastly, our experiments reveal that the prominent chat\nmodels, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high\nsusceptibility to jailbreaking attacks, with some categories achieving nearly\n70-100% success rate. These insights underline the potential of pruning as a\ngeneralizable approach for improving LLM safety, reliability, and potentially\nother desired behaviors.\n","authors":["Adib Hasan","Ileana Rugina","Alex Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10859v1","updated":"2024-01-19T18:03:21Z","published":"2024-01-19T18:03:21Z","title":"Ensembler: Combating model inversion attacks using model ensemble during\n collaborative inference","summary":" Deep learning models have exhibited remarkable performance across various\ndomains. Nevertheless, the burgeoning model sizes compel edge devices to\noffload a significant portion of the inference process to the cloud. 
While this\npractice offers numerous advantages, it also raises critical concerns regarding\nuser data privacy. In scenarios where the cloud server's trustworthiness is in\nquestion, the need for a practical and adaptable method to safeguard data\nprivacy becomes imperative. In this paper, we introduce Ensembler, an\nextensible framework designed to substantially increase the difficulty of\nconducting model inversion attacks for adversarial parties. Ensembler leverages\nmodel ensembling on the adversarial server, running in parallel with existing\napproaches that introduce perturbations to sensitive data during colloborative\ninference. Our experiments demonstrate that when combined with even basic\nGaussian noise, Ensembler can effectively shield images from reconstruction\nattacks, achieving recognition levels that fall below human performance in some\nstrict settings, significantly outperforming baseline methods lacking the\nEnsembler framework.\n","authors":["Dancheng Liu","Jinjun Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.10859v1.pdf","comment":"in submission"},{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2309.14393v2","updated":"2024-01-19T17:33:44Z","published":"2023-09-25T14:50:04Z","title":"LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language\n Models","summary":" The carbon footprint associated with large language models (LLMs) is a\nsignificant concern, encompassing emissions from their training, inference,\nexperimentation, and storage processes, including operational and embodied\ncarbon emissions. An essential aspect is accurately estimating the carbon\nimpact of emerging LLMs even before their training, which heavily relies on GPU\nusage. 
Existing studies have reported the carbon footprint of LLM training, but\nonly one tool, mlco2, can predict the carbon footprint of new neural networks\nprior to physical training. However, mlco2 has several serious limitations. It\ncannot extend its estimation to dense or mixture-of-experts (MoE) LLMs,\ndisregards critical architectural parameters, focuses solely on GPUs, and\ncannot model embodied carbon footprints. Addressing these gaps, we introduce\n\\textit{\\carb}, an end-to-end carbon footprint projection model designed for\nboth dense and MoE LLMs. Compared to mlco2, \\carb~significantly enhances the\naccuracy of carbon footprint estimations for various LLMs. The source code is\nreleased at \\url{https://github.com/SotaroKaneda/MLCarbon}.\n","authors":["Ahmad Faiz","Sotaro Kaneda","Ruhan Wang","Rita Osi","Prateek Sharma","Fan Chen","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.14393v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2211.13350v2","updated":"2024-01-19T17:33:36Z","published":"2022-11-23T23:31:14Z","title":"Choreographer: Learning and Adapting Skills in Imagination","summary":" Unsupervised skill learning aims to learn a rich repertoire of behaviors\nwithout external supervision, providing artificial agents with the ability to\ncontrol and influence the environment. However, without appropriate knowledge\nand exploration, skills may provide control only over a restricted area of the\nenvironment, limiting their applicability. Furthermore, it is unclear how to\nleverage the learned skill behaviors for adapting to downstream tasks in a\ndata-efficient manner. We present Choreographer, a model-based agent that\nexploits its world model to learn and adapt skills in imagination. Our method\ndecouples the exploration and skill learning processes, being able to discover\nskills in the latent state space of the model. During adaptation, the agent\nuses a meta-controller to evaluate and adapt the learned skills efficiently by\ndeploying them in parallel in imagination. Choreographer is able to learn\nskills both from offline data, and by collecting data simultaneously with an\nexploration policy. The skills can be used to effectively adapt to downstream\ntasks, as we show in the URL benchmark, where we outperform previous approaches\nfrom both pixels and states inputs. The learned skills also explore the\nenvironment thoroughly, finding sparse rewards more frequently, as shown in\ngoal-reaching tasks from the DMC Suite and Meta-World. Website and code:\nhttps://skillchoreographer.github.io/\n","authors":["Pietro Mazzaglia","Tim Verbelen","Bart Dhoedt","Alexandre Lacoste","Sai Rajeswar"],"pdf_url":"https://arxiv.org/pdf/2211.13350v2.pdf","comment":"Accepted at ICLR 2023 (notable top 25%)"},{"id":"http://arxiv.org/abs/2401.10831v1","updated":"2024-01-19T17:27:21Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. 
In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanisms are universal in video transformers. Finally, we demonstrate\nthat VTCD can be used to improve model performance for fine-grained tasks.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10825v1","updated":"2024-01-19T17:21:05Z","published":"2024-01-19T17:21:05Z","title":"A survey on recent advances in named entity recognition","summary":" Named Entity Recognition seeks to extract substrings within a text that name\nreal-world objects and to determine their type (for example, whether they refer\nto persons or organizations). In this survey, we first present an overview of\nrecent popular approaches, but we also look at graph- and transformer-based\nmethods including Large Language Models (LLMs) that have not had much coverage\nin other surveys. Second, we focus on methods designed for datasets with scarce\nannotations. Third, we evaluate the performance of the main NER implementations\non a variety of datasets with differing characteristics (as regards their\ndomain, their size, and their number of classes). We thus provide a deep\ncomparison of algorithms that are never considered together. Our experiments\nshed some light on how the characteristics of datasets affect the behavior of\nthe methods that we compare.\n","authors":["Imed Keraghel","Stanislas Morbieu","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2401.10825v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2310.12955v2","updated":"2024-01-19T17:12:23Z","published":"2023-10-19T17:54:39Z","title":"Towards Robust Offline Reinforcement Learning under Diverse Data\n Corruption","summary":" Offline reinforcement learning (RL) presents a promising approach for\nlearning reinforced policies from offline datasets without the need for costly\nor unsafe interactions with the environment. However, datasets collected by\nhumans in real-world environments are often noisy and may even be maliciously\ncorrupted, which can significantly degrade the performance of offline RL. In\nthis work, we first investigate the performance of current offline RL\nalgorithms under comprehensive data corruption, including states, actions,\nrewards, and dynamics. Our extensive experiments reveal that implicit\nQ-learning (IQL) demonstrates remarkable resilience to data corruption among\nvarious offline RL algorithms. Furthermore, we conduct both empirical and\ntheoretical analyses to understand IQL's robust performance, identifying its\nsupervised policy learning scheme as the key factor. Despite its relative\nrobustness, IQL still suffers from heavy-tail targets of Q functions under\ndynamics corruption. 
To tackle this challenge, we draw inspiration from robust\nstatistics to employ the Huber loss to handle the heavy-tailedness and utilize\nquantile estimators to balance penalization for corrupted data and learning\nstability. By incorporating these simple yet effective modifications into IQL,\nwe propose a more robust offline RL approach named Robust IQL (RIQL). Extensive\nexperiments demonstrate that RIQL exhibits highly robust performance when\nsubjected to diverse data corruption scenarios.\n","authors":["Rui Yang","Han Zhong","Jiawei Xu","Amy Zhang","Chongjie Zhang","Lei Han","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.12955v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10819v1","updated":"2024-01-19T17:09:32Z","published":"2024-01-19T17:09:32Z","title":"Optimisation in Neurosymbolic Learning Systems","summary":" Neurosymbolic AI aims to integrate deep learning with symbolic AI. This\nintegration has many promises, such as decreasing the amount of data required\nto train a neural network, improving the explainability and interpretability of\nanswers given by models and verifying the correctness of trained systems. We\nstudy neurosymbolic learning, where we have both data and background knowledge\nexpressed using symbolic languages. How do we connect the symbolic and neural\ncomponents to communicate this knowledge? One option is fuzzy reasoning, which\nstudies degrees of truth. For example, being tall is not a binary concept.\nInstead, probabilistic reasoning studies the probability that something is true\nor will happen. Our first research question studies how different forms of\nfuzzy reasoning combine with learning. We find surprising results like a\nconnection to the Raven paradox stating we confirm \"ravens are black\" when we\nobserve a green apple. In this study, we did not use the background knowledge\nwhen we deployed our models after training. In our second research question, we\nstudied how to use background knowledge in deployed models. We developed a new\nneural network layer based on fuzzy reasoning. Probabilistic reasoning is a\nnatural fit for neural networks, which we usually train to be probabilistic.\nHowever, they are expensive to compute and do not scale well to large tasks. In\nour third research question, we study how to connect probabilistic reasoning\nwith neural networks by sampling to estimate averages, while in the final\nresearch question, we study scaling probabilistic neurosymbolic learning to\nmuch larger problems than before. Our insight is to train a neural network with\nsynthetic data to predict the result of probabilistic reasoning.\n","authors":["Emile van Krieken"],"pdf_url":"https://arxiv.org/pdf/2401.10819v1.pdf","comment":"PhD dissertation"},{"id":"http://arxiv.org/abs/2401.10816v1","updated":"2024-01-19T17:03:37Z","published":"2024-01-19T17:03:37Z","title":"Co-Pilot for Health: Personalized Algorithmic AI Nudging to Improve\n Health Outcomes","summary":" The ability to shape health behaviors of large populations automatically,\nacross wearable types and disease conditions at scale has tremendous potential\nto improve global health outcomes. We designed and implemented an AI driven\nplatform for digital algorithmic nudging, enabled by a Graph-Neural Network\n(GNN) based Recommendation System, and granular health behavior data from\nwearable fitness devices. 
Here we describe the efficacy results of this\nplatform with its capabilities of personalized and contextual nudging to\n$n=84,764$ individuals over a 12-week period in Singapore. We statistically\nvalidated that participants in the target group who received such AI optimized\ndaily nudges increased daily physical activity like step count by 6.17% ($p =\n3.09\\times10^{-4}$) and weekly minutes of Moderate to Vigorous Physical\nActivity (MVPA) by 7.61% ($p = 1.16\\times10^{-2}$), compared to matched\nparticipants in control group who did not receive any nudges. Further, such\nnudges were very well received, with a 13.1% of nudges sent being opened (open\nrate), and 11.7% of the opened nudges rated useful compared to 1.9% rated as\nnot useful thereby demonstrating significant improvement in population level\nengagement metrics.\n","authors":["Jodi Chiam","Aloysius Lim","Cheryl Nott","Nicholas Mark","Ankur Teredesai","Sunil Shinde"],"pdf_url":"https://arxiv.org/pdf/2401.10816v1.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.10811v1","updated":"2024-01-19T16:56:11Z","published":"2024-01-19T16:56:11Z","title":"Simulation Based Bayesian Optimization","summary":" Bayesian Optimization (BO) is a powerful method for optimizing black-box\nfunctions by combining prior knowledge with ongoing function evaluations. BO\nconstructs a probabilistic surrogate model of the objective function given the\ncovariates, which is in turn used to inform the selection of future evaluation\npoints through an acquisition function. For smooth continuous search spaces,\nGaussian Processes (GPs) are commonly used as the surrogate model as they offer\nanalytical access to posterior predictive distributions, thus facilitating the\ncomputation and optimization of acquisition functions. However, in complex\nscenarios involving optimizations over categorical or mixed covariate spaces,\nGPs may not be ideal.\n This paper introduces Simulation Based Bayesian Optimization (SBBO) as a\nnovel approach to optimizing acquisition functions that only requires\n\\emph{sampling-based} access to posterior predictive distributions. SBBO allows\nthe use of surrogate probabilistic models tailored for combinatorial spaces\nwith discrete variables. Any Bayesian model in which posterior inference is\ncarried out through Markov chain Monte Carlo can be selected as the surrogate\nmodel in SBBO. In applications involving combinatorial optimization, we\ndemonstrate empirically the effectiveness of SBBO method using various choices\nof surrogate models.\n","authors":["Roi Naveiro","Becky Tang"],"pdf_url":"https://arxiv.org/pdf/2401.10811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10809v1","updated":"2024-01-19T16:52:53Z","published":"2024-01-19T16:52:53Z","title":"Neglected Hessian component explains mysteries in Sharpness\n regularization","summary":" Recent work has shown that methods like SAM which either explicitly or\nimplicitly penalize second order information can improve generalization in deep\nlearning. Seemingly similar methods like weight noise and gradient penalties\noften fail to provide such benefits. We show that these differences can be\nexplained by the structure of the Hessian of the loss. First, we show that a\ncommon decomposition of the Hessian can be quantitatively interpreted as\nseparating the feature exploitation from feature exploration. 
The feature\nexploration, which can be described by the Nonlinear Modeling Error matrix\n(NME), is commonly neglected in the literature since it vanishes at\ninterpolation. Our work shows that the NME is in fact important as it can\nexplain why gradient penalties are sensitive to the choice of activation\nfunction. Using this insight we design interventions to improve performance. We\nalso provide evidence that challenges the long held equivalence of weight noise\nand gradient penalties. This equivalence relies on the assumption that the NME\ncan be ignored, which we find does not hold for modern networks since they\ninvolve significant feature learning. We find that regularizing feature\nexploitation but not feature exploration yields performance similar to gradient\npenalties.\n","authors":["Yann N. Dauphin","Atish Agarwala","Hossein Mobahi"],"pdf_url":"https://arxiv.org/pdf/2401.10809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07626v3","updated":"2024-01-19T16:52:27Z","published":"2022-08-16T09:24:47Z","title":"Algorithmic Assistance with Recommendation-Dependent Preferences","summary":" When an algorithm provides risk assessments, we typically think of them as\nhelpful inputs to human decisions, such as when risk scores are presented to\njudges or doctors. However, a decision-maker may not only react to the\ninformation provided by the algorithm. The decision-maker may also view the\nalgorithmic recommendation as a default action, making it costly for them to\ndeviate, such as when a judge is reluctant to overrule a high-risk assessment\nfor a defendant or a doctor fears the consequences of deviating from\nrecommended procedures. To address such unintended consequences of algorithmic\nassistance, we propose a principal-agent model of joint human-machine\ndecision-making. Within this model, we consider the effect and design of\nalgorithmic recommendations when they affect choices not just by shifting\nbeliefs, but also by altering preferences. We motivate this assumption from\ninstitutional factors, such as a desire to avoid audits, as well as from\nwell-established models in behavioral science that predict loss aversion\nrelative to a reference point, which here is set by the algorithm. We show that\nrecommendation-dependent preferences create inefficiencies where the\ndecision-maker is overly responsive to the recommendation. As a potential\nremedy, we discuss algorithms that strategically withhold recommendations, and\nshow how they can improve the quality of final decisions.\n","authors":["Bryce McLaughlin","Jann Spiess"],"pdf_url":"https://arxiv.org/pdf/2208.07626v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10805v1","updated":"2024-01-19T16:48:49Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" In this work, we introduce the novel concept of visually Connecting Actions\nand Their Effects (CATE) in video understanding. CATE can have applications in\nareas like task planning and learning from demonstration. We propose different\nCATE-based task formulations, such as action selection and action\nspecification, where video understanding models connect actions and effects at\nsemantic and fine-grained levels. We observe that different formulations\nproduce representations capturing intuitive action properties. We also design\nvarious baseline models for action selection and action specification. 
Despite\nthe intuitive nature of the task, we observe that models struggle, and humans\noutperform them by a large margin. The study aims to establish a foundation for\nfuture efforts, showcasing the flexibility and versatility of connecting\nactions and effects in video understanding, with the hope of inspiring advanced\nformulations and models.\n","authors":["Eric Peh","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10800v1","updated":"2024-01-19T16:36:27Z","published":"2024-01-19T16:36:27Z","title":"Estimation of AMOC transition probabilities using a machine learning\n based rare-event algorithm","summary":" The Atlantic Meridional Overturning Circulation (AMOC) is an important\ncomponent of the global climate, known to be a tipping element, as it could\ncollapse under global warming. The main objective of this study is to compute\nthe probability that the AMOC collapses within a specified time window, using a\nrare-event algorithm called Trajectory-Adaptive Multilevel Splitting (TAMS).\nHowever, the efficiency and accuracy of TAMS depend on the choice of the score\nfunction. Although the definition of the optimal score function, called\n``committor function\" is known, it is impossible in general to compute it a\npriori. Here, we combine TAMS with a Next-Generation Reservoir Computing\ntechnique that estimates the committor function from the data generated by the\nrare-event algorithm. We test this technique in a stochastic box model of the\nAMOC for which two types of transition exist, the so-called F(ast)-transitions\nand S(low)-transitions. Results for the F-transtions compare favorably with\nthose in the literature where a physically-informed score function was used. We\nshow that coupling a rare-event algorithm with machine learning allows for a\ncorrect estimation of transition probabilities, transition times, and even\ntransition paths for a wide range of model parameters. We then extend these\nresults to the more difficult problem of S-transitions in the same model. In\nboth cases of F- and S-transitions, we also show how the Next-Generation\nReservoir Computing technique can be interpreted to retrieve an analytical\nestimate of the committor function.\n","authors":["Valérian Jacques-Dumas","René M. van Westen","Henk A. Dijkstra"],"pdf_url":"https://arxiv.org/pdf/2401.10800v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10799v1","updated":"2024-01-19T16:34:37Z","published":"2024-01-19T16:34:37Z","title":"Novel Representation Learning Technique using Graphs for Performance\n Analytics","summary":" The performance analytics domain in High Performance Computing (HPC) uses\ntabular data to solve regression problems, such as predicting the execution\ntime. Existing Machine Learning (ML) techniques leverage the correlations among\nfeatures given tabular datasets, not leveraging the relationships between\nsamples directly. Moreover, since high-quality embeddings from raw features\nimprove the fidelity of the downstream predictive models, existing methods rely\non extensive feature engineering and pre-processing steps, costing time and\nmanual effort. To fill these two gaps, we propose a novel idea of transforming\ntabular performance data into graphs to leverage the advancement of Graph\nNeural Network-based (GNN) techniques in capturing complex relationships\nbetween features and samples. 
In contrast to other ML application domains, such\nas social networks, the graph is not given; instead, we need to build it. To\naddress this gap, we propose graph-building methods where nodes represent\nsamples, and the edges are automatically inferred iteratively based on the\nsimilarity between the features in the samples. We evaluate the effectiveness\nof the generated embeddings from GNNs based on how well they make even a simple\nfeed-forward neural network perform for regression tasks compared to other\nstate-of-the-art representation learning techniques. Our evaluation\ndemonstrates that even with up to 25% random missing values for each dataset,\nour method outperforms commonly used graph and Deep Neural Network (DNN)-based\napproaches and achieves up to 61.67% & 78.56% improvement in MSE loss over the\nDNN baseline respectively for HPC dataset and Machine Learning Datasets.\n","authors":["Tarek Ramadan","Ankur Lahiry","Tanzima Z. Islam"],"pdf_url":"https://arxiv.org/pdf/2401.10799v1.pdf","comment":"This paper has been accepted at 22nd International Conference on\n Machine Learning and Applications (ICMLA2023)"},{"id":"http://arxiv.org/abs/2201.05158v3","updated":"2024-01-19T16:26:46Z","published":"2022-01-13T16:35:45Z","title":"Towards Quantum Graph Neural Networks: An Ego-Graph Learning Approach","summary":" Quantum machine learning is a fast-emerging field that aims to tackle machine\nlearning using quantum algorithms and quantum computing. Due to the lack of\nphysical qubits and an effective means to map real-world data from Euclidean\nspace to Hilbert space, most of these methods focus on quantum analogies or\nprocess simulations rather than devising concrete architectures based on\nqubits. In this paper, we propose a novel hybrid quantum-classical algorithm\nfor graph-structured data, which we refer to as the Ego-graph based Quantum\nGraph Neural Network (egoQGNN). egoQGNN implements the GNN theoretical\nframework using the tensor product and unity matrix representation, which\ngreatly reduces the number of model parameters required. When controlled by a\nclassical computer, egoQGNN can accommodate arbitrarily sized graphs by\nprocessing ego-graphs from the input graph using a modestly-sized quantum\ndevice. The architecture is based on a novel mapping from real-world data to\nHilbert space. This mapping maintains the distance relations present in the\ndata and reduces information loss. Experimental results show that the proposed\nmethod outperforms competitive state-of-the-art models with only 1.68\\%\nparameters compared to those models.\n","authors":["Xing Ai","Zhihong Zhang","Luzhe Sun","Junchi Yan","Edwin Hancock"],"pdf_url":"https://arxiv.org/pdf/2201.05158v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10794v1","updated":"2024-01-19T16:26:35Z","published":"2024-01-19T16:26:35Z","title":"Deep Reinforcement Learning Empowered Activity-Aware Dynamic Health\n Monitoring Systems","summary":" In smart healthcare, health monitoring utilizes diverse tools and\ntechnologies to analyze patients' real-time biosignal data, enabling immediate\nactions and interventions. Existing monitoring approaches were designed on the\npremise that medical devices track several health metrics concurrently,\ntailored to their designated functional scope. This means that they report all\nrelevant health values within that scope, which can result in excess resource\nuse and the gathering of extraneous data due to monitoring irrelevant health\nmetrics. 
In this context, we propose Dynamic Activity-Aware Health Monitoring\nstrategy (DActAHM) for striking a balance between optimal monitoring\nperformance and cost efficiency, a novel framework based on Deep Reinforcement\nLearning (DRL) and SlowFast Model to ensure precise monitoring based on users'\nactivities. Specifically, with the SlowFast Model, DActAHM efficiently\nidentifies individual activities and captures these results for enhanced\nprocessing. Subsequently, DActAHM refines health metric monitoring in response\nto the identified activity by incorporating a DRL framework. Extensive\nexperiments comparing DActAHM against three state-of-the-art approaches\ndemonstrate it achieves 27.3% higher gain than the best-performing baseline\nthat fixes monitoring actions over timeline.\n","authors":["Ziqiaing Ye","Yulan Gao","Yue Xiao","Zehui Xiong","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2401.10794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10791v1","updated":"2024-01-19T16:23:53Z","published":"2024-01-19T16:23:53Z","title":"Early alignment in two-layer networks training is a two-edged sword","summary":" Training neural networks with first order optimisation methods is at the core\nof the empirical success of deep learning. The scale of initialisation is a\ncrucial factor, as small initialisations are generally associated to a feature\nlearning regime, for which gradient descent is implicitly biased towards simple\nsolutions. This work provides a general and quantitative description of the\nearly alignment phase, originally introduced by Maennel et al. (2018) . For\nsmall initialisation and one hidden ReLU layer networks, the early stage of the\ntraining dynamics leads to an alignment of the neurons towards key directions.\nThis alignment induces a sparse representation of the network, which is\ndirectly related to the implicit bias of gradient flow at convergence. This\nsparsity inducing alignment however comes at the expense of difficulties in\nminimising the training objective: we also provide a simple data example for\nwhich overparameterised networks fail to converge towards global minima and\nonly converge to a spurious stationary point instead.\n","authors":["Etienne Boursier","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2401.10791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10790v1","updated":"2024-01-19T16:21:55Z","published":"2024-01-19T16:21:55Z","title":"Measuring the Impact of Scene Level Objects on Object Detection: Towards\n Quantitative Explanations of Detection Decisions","summary":" Although accuracy and other common metrics can provide a useful window into\nthe performance of an object detection model, they lack a deeper view of the\nmodel's decision process. Regardless of the quality of the training data and\nprocess, the features that an object detection model learns cannot be\nguaranteed. A model may learn a relationship between certain background\ncontext, i.e., scene level objects, and the presence of the labeled classes.\nFurthermore, standard performance verification and metrics would not identify\nthis phenomenon. This paper presents a new black box explainability method for\nadditional verification of object detection models by finding the impact of\nscene level objects on the identification of the objects within the image. By\ncomparing the accuracies of a model on test data with and without certain scene\nlevel objects, the contributions of these objects to the model's performance\nbecomes clearer. 
The experiment presented here will assess the impact of\nbuildings and people in image context on the detection of emergency road\nvehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the\npresence of a scene level object will indicate the model's reliance on that\nobject to make its detections. The results of this research lead to providing a\nquantitative explanation of the object detection model's decision process,\nenabling a deeper understanding of the model's performance.\n","authors":["Lynn Vonder Haar","Timothy Elvira","Luke Newcomb","Omar Ochoa"],"pdf_url":"https://arxiv.org/pdf/2401.10790v1.pdf","comment":"9 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.07961v2","updated":"2024-01-19T15:55:16Z","published":"2024-01-15T20:57:50Z","title":"Solution of the Probabilistic Lambert Problem: Connections with Optimal\n Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs","summary":" Lambert's problem concerns with transferring a spacecraft from a given\ninitial to a given terminal position within prescribed flight time via velocity\ncontrol subject to a gravitational force field. We consider a probabilistic\nvariant of the Lambert problem where the knowledge of the endpoint constraints\nin position vectors are replaced by the knowledge of their respective joint\nprobability density functions. We show that the Lambert problem with endpoint\njoint probability density constraints is a generalized optimal mass transport\n(OMT) problem, thereby connecting this classical astrodynamics problem with a\nburgeoning area of research in modern stochastic control and stochastic machine\nlearning. This newfound connection allows us to rigorously establish the\nexistence and uniqueness of solution for the probabilistic Lambert problem. The\nsame connection also helps to numerically solve the probabilistic Lambert\nproblem via diffusion regularization, i.e., by leveraging further connection of\nthe OMT with the Schr\\\"odinger bridge problem (SBP). This also shows that the\nprobabilistic Lambert problem with additive dynamic process noise is in fact a\ngeneralized SBP, and can be solved numerically using the so-called\nSchr\\\"odinger factors, as we do in this work. We explain how the resulting\nanalysis leads to solving a boundary-coupled system of reaction-diffusion PDEs\nwhere the nonlinear gravitational potential appears as the reaction rate. We\npropose novel algorithms for the same, and present illustrative numerical\nresults. Our analysis and the algorithmic framework are nonparametric, i.e., we\nmake neither statistical (e.g., Gaussian, first few moments, mixture or\nexponential family, finite dimensionality of the sufficient statistic) nor\ndynamical (e.g., Taylor series) approximations.\n","authors":["Alexis M. H. 
Teter","Iman Nodozi","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2401.07961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10774v1","updated":"2024-01-19T15:48:40Z","published":"2024-01-19T15:48:40Z","title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple\n Decoding Heads","summary":" The inference process in Large Language Models (LLMs) is often limited due to\nthe absence of parallelism in the auto-regressive decoding process, resulting\nin most operations being restricted by the memory bandwidth of accelerators.\nWhile methods such as speculative decoding have been suggested to address this\nissue, their implementation is impeded by the challenges associated with\nacquiring and maintaining a separate draft model. In this paper, we present\nMedusa, an efficient method that augments LLM inference by adding extra\ndecoding heads to predict multiple subsequent tokens in parallel. Using a\ntree-based attention mechanism, Medusa constructs multiple candidate\ncontinuations and verifies them simultaneously in each decoding step. By\nleveraging parallel processing, Medusa introduces only minimal overhead in\nterms of single-step latency while substantially reducing the number of\ndecoding steps required.\n We present two levels of fine-tuning procedures for Medusa to meet the needs\nof different use cases: Medusa-1: Medusa is directly fine-tuned on top of a\nfrozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa\nis fine-tuned together with the backbone LLM, enabling better prediction\naccuracy of Medusa heads and higher speedup but needing a special training\nrecipe that preserves the backbone model's capabilities.\n Moreover, we propose several extensions that improve or expand the utility of\nMedusa, including a self-distillation to handle situations where no training\ndata is available and a typical acceptance scheme to boost the acceptance rate\nwhile maintaining generation quality. We evaluate Medusa on models of various\nsizes and training procedures. Our experiments demonstrate that Medusa-1 can\nachieve over 2.2x speedup without compromising generation quality, while\nMedusa-2 further improves the speedup to 2.3-3.6x.\n","authors":["Tianle Cai","Yuhong Li","Zhengyang Geng","Hongwu Peng","Jason D. Lee","Deming Chen","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2401.10774v1.pdf","comment":"The code for this implementation is available at\n https://github.com/FasterDecoding/Medusa"},{"id":"http://arxiv.org/abs/2401.10765v1","updated":"2024-01-19T15:37:11Z","published":"2024-01-19T15:37:11Z","title":"Starlit: Privacy-Preserving Federated Learning to Enhance Financial\n Fraud Detection","summary":" Federated Learning (FL) is a data-minimization approach enabling\ncollaborative model training across diverse clients with local data, avoiding\ndirect data exchange. However, state-of-the-art FL solutions to identify\nfraudulent financial transactions exhibit a subset of the following\nlimitations. 
They (1) lack a formal security definition and proof, (2) assume\nprior freezing of suspicious customers' accounts by financial institutions\n(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$\ncomputationally expensive modular exponentiation (where $n$ is the total number\nof financial institutions) or highly inefficient fully homomorphic encryption,\n(4) assume the parties have already completed the identity alignment phase,\nhence excluding it from the implementation, performance evaluation, and\nsecurity analysis, and (5) struggle to resist clients' dropouts. This work\nintroduces Starlit, a novel scalable privacy-preserving FL mechanism that\novercomes these limitations. It has various applications, such as enhancing\nfinancial fraud detection, mitigating terrorism, and enhancing digital health.\nWe implemented Starlit and conducted a thorough performance analysis using\nsynthetic data from a key player in global financial transactions. The\nevaluation indicates Starlit's scalability, efficiency, and accuracy.\n","authors":["Aydin Abadi","Bradley Doyle","Francesco Gini","Kieron Guinamard","Sasi Kumar Murakonda","Jack Liddell","Paul Mellor","Steven J. Murdoch","Mohammad Naseri","Hector Page","George Theodorakopoulos","Suzanne Weller"],"pdf_url":"https://arxiv.org/pdf/2401.10765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17046v2","updated":"2024-01-19T15:33:12Z","published":"2023-03-29T22:18:47Z","title":"Have it your way: Individualized Privacy Assignment for DP-SGD","summary":" When training a machine learning model with differential privacy, one sets a\nprivacy budget. This budget represents a maximal privacy violation that any\nuser is willing to face by contributing their data to the training set. We\nargue that this approach is limited because different users may have different\nprivacy expectations. Thus, setting a uniform privacy budget across all points\nmay be overly conservative for some users or, conversely, not sufficiently\nprotective for others. In this paper, we capture these preferences through\nindividualized privacy budgets. To demonstrate their practicality, we introduce\na variant of Differentially Private Stochastic Gradient Descent (DP-SGD) which\nsupports such individualized budgets. DP-SGD is the canonical approach to\ntraining models with differential privacy. We modify its data sampling and\ngradient noising mechanisms to arrive at our approach, which we call\nIndividualized DP-SGD (IDP-SGD). Because IDP-SGD provides privacy guarantees\ntailored to the preferences of individual users and their data points, we find\nit empirically improves privacy-utility trade-offs.\n","authors":["Franziska Boenisch","Christopher Mühl","Adam Dziedzic","Roy Rinberg","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2303.17046v2.pdf","comment":"Published at NeurIPS'2024"},{"id":"http://arxiv.org/abs/2205.14102v3","updated":"2024-01-19T15:30:04Z","published":"2022-05-27T17:12:26Z","title":"Group-level Brain Decoding with Deep Learning","summary":" Decoding brain imaging data are gaining popularity, with applications in\nbrain-computer interfaces and the study of neural representations. Decoding is\ntypically subject-specific and does not generalise well over subjects, due to\nhigh amounts of between-subject variability. Techniques that overcome this will\nnot only provide richer neuroscientific insights but also make it possible for\ngroup-level models to outperform subject-specific models. Here, we propose a\nmethod that uses subject embedding, analogous to word embedding in natural\nlanguage processing, to learn and exploit the structure in between-subject\nvariability as part of a decoding model, our adaptation of the WaveNet\narchitecture for classification. We apply this to magnetoencephalography data,\nwhere 15 subjects viewed 118 different images, with 30 examples per image; to\nclassify images using the entire 1 s window following image presentation. We\nshow that the combination of deep learning and subject embedding is crucial to\nclosing the performance gap between subject- and group-level decoding models.\nImportantly, group models outperform subject models on low-accuracy subjects\n(although slightly impair high-accuracy subjects) and can be helpful for\ninitialising subject models. While we have not generally found\ngroup-level models to perform better than subject-level models, the performance\nof group modelling is expected to be even higher with bigger datasets. In order\nto provide physiological interpretation at the group level, we make use of\npermutation feature importance. This provides insights into the spatiotemporal\nand spectral information encoded in the models. All code is available on GitHub\n(https://github.com/ricsinaruto/MEG-group-decode).\n","authors":["Richard Csaky","Mats Van Es","Oiwi Parker Jones","Mark Woolrich"],"pdf_url":"https://arxiv.org/pdf/2205.14102v3.pdf","comment":"Published in Human Brain Mapping"},{"id":"http://arxiv.org/abs/2401.10754v1","updated":"2024-01-19T15:25:09Z","published":"2024-01-19T15:25:09Z","title":"Data Augmentation for Traffic Classification","summary":" Data Augmentation (DA) -- enriching training data by adding synthetic samples\n-- is a technique widely adopted in Computer Vision (CV) and Natural Language\nProcessing (NLP) tasks to improve models performance. Yet, DA has struggled to\ngain traction in networking contexts, particularly in Traffic Classification\n(TC) tasks. In this work, we fulfill this gap by benchmarking 18 augmentation\nfunctions applied to 3 TC datasets using packet time series as input\nrepresentation and considering a variety of training conditions. Our results\nshow that (i) DA can reap benefits previously unexplored with (ii)\naugmentations acting on time series sequence order and masking being a better\nsuit for TC and (iii) simple latent space analysis can provide hints about why\naugmentations have positive or negative effects.\n","authors":["Chao Wang","Alessandro Finamore","Pietro Michiardi","Massimo Gallo","Dario Rossi"],"pdf_url":"https://arxiv.org/pdf/2401.10754v1.pdf","comment":"to appear at Passive and Active Measurements (PAM), 2024"},{"id":"http://arxiv.org/abs/2401.10753v1","updated":"2024-01-19T15:22:28Z","published":"2024-01-19T15:22:28Z","title":"BoolGebra: Attributed Graph-learning for Boolean Algebraic Manipulation","summary":" Boolean algebraic manipulation is at the core of logic synthesis in\nElectronic Design Automation (EDA) design flow. Existing methods struggle to\nfully exploit optimization opportunities, and often suffer from an explosive\nsearch space and limited scalability efficiency. This work presents BoolGebra,\na novel attributed graph-learning approach for Boolean algebraic manipulation\nthat aims to improve fundamental logic synthesis. BoolGebra incorporates Graph\nNeural Networks (GNNs) and takes initial feature embeddings from both\nstructural and functional information as inputs. 
A fully connected neural\nnetwork is employed as the predictor for direct optimization result\npredictions, significantly reducing the search space and efficiently locating\nthe optimization space. The experiments involve training the BoolGebra model\nw.r.t design-specific and cross-design inferences using the trained model,\nwhere BoolGebra demonstrates generalizability for cross-design inference and\nits potential to scale from small, simple training datasets to large, complex\ninference datasets. Finally, BoolGebra is integrated with existing synthesis\ntool ABC to perform end-to-end logic minimization evaluation w.r.t SOTA\nbaselines.\n","authors":["Yingjie Li","Anthony Agnesina","Yanqing Zhang","Haoxing Ren","Cunxi Yu"],"pdf_url":"https://arxiv.org/pdf/2401.10753v1.pdf","comment":"DATE 2024 extended version. arXiv admin note: text overlap with\n arXiv:2310.07846"},{"id":"http://arxiv.org/abs/2310.13384v2","updated":"2024-01-19T15:19:54Z","published":"2023-10-20T09:53:55Z","title":"Salted Inference: Enhancing Privacy while Maintaining Efficiency of\n Split Inference in Mobile Computing","summary":" In split inference, a deep neural network (DNN) is partitioned to run the\nearly part of the DNN at the edge and the later part of the DNN in the cloud.\nThis meets two key requirements for on-device machine learning: input privacy\nand computation efficiency. Still, an open question in split inference is\noutput privacy, given that the outputs of the DNN are observable in the cloud.\nWhile encrypted computing can protect output privacy too, homomorphic\nencryption requires substantial computation and communication resources from\nboth edge and cloud devices. In this paper, we introduce Salted DNNs: a novel\napproach that enables clients at the edge, who run the early part of the DNN,\nto control the semantic interpretation of the DNN's outputs at inference time.\nOur proposed Salted DNNs maintain classification accuracy and computation\nefficiency very close to the standard DNN counterparts. Experimental\nevaluations conducted on both images and wearable sensor data demonstrate that\nSalted DNNs attain classification accuracy very close to standard DNNs,\nparticularly when the Salted Layer is positioned within the early part to meet\nthe requirements of split inference. Our approach is general and can be applied\nto various types of DNNs. As a benchmark for future studies, we open-source our\ncode.\n","authors":["Mohammad Malekzadeh","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2310.13384v2.pdf","comment":"To be appeared in the 25th International Workshop on Mobile Computing\n Systems and Applications (HotMobile 2024)"},{"id":"http://arxiv.org/abs/2305.03077v2","updated":"2024-01-19T15:16:37Z","published":"2023-05-04T18:00:01Z","title":"Explaining dark matter halo density profiles with neural networks","summary":" We use explainable neural networks to connect the evolutionary history of\ndark matter halos with their density profiles. The network captures independent\nfactors of variation in the density profiles within a low-dimensional\nrepresentation, which we physically interpret using mutual information. Without\nany prior knowledge of the halos' evolution, the network recovers the known\nrelation between the early time assembly and the inner profile, and discovers\nthat the profile beyond the virial radius is described by a single parameter\ncapturing the most recent mass accretion rate. 
The results illustrate the\npotential for machine-assisted scientific discovery in complicated\nastrophysical datasets.\n","authors":["Luisa Lucie-Smith","Hiranya V. Peiris","Andrew Pontzen"],"pdf_url":"https://arxiv.org/pdf/2305.03077v2.pdf","comment":"7 pages, 5 figures. Minor changes to match version accepted for\n publication in PRL"},{"id":"http://arxiv.org/abs/2401.10746v1","updated":"2024-01-19T15:13:30Z","published":"2024-01-19T15:13:30Z","title":"A Systematic Evaluation of Euclidean Alignment with Deep Learning for\n EEG Decoding","summary":" Electroencephalography (EEG) signals are frequently used for various\nBrain-Computer Interface (BCI) tasks. While Deep Learning (DL) techniques have\nshown promising results, they are hindered by the substantial data\nrequirements. By leveraging data from multiple subjects, transfer learning\nenables more effective training of DL models. A technique that is gaining\npopularity is Euclidean Alignment (EA) due to its ease of use, low\ncomputational complexity, and compatibility with Deep Learning models. However,\nfew studies evaluate its impact on the training performance of shared and\nindividual DL models. In this work, we systematically evaluate the effect of EA\ncombined with DL for decoding BCI signals. We used EA to train shared models\nwith data from multiple subjects and evaluated its transferability to new\nsubjects. Our experimental results show that it improves decoding in the target\nsubject by 4.33% and decreases convergence time by more than 70%. We also\ntrained individual models for each subject to use as a majority-voting ensemble\nclassifier. In this scenario, using EA improved the 3-model ensemble accuracy\nby 3.7%. However, when compared to the shared model with EA, the ensemble\naccuracy was 3.62% lower.\n","authors":["Bruna Junqueira","Bruno Aristimunha","Sylvain Chevallier","Raphael Y. de Camargo"],"pdf_url":"https://arxiv.org/pdf/2401.10746v1.pdf","comment":"14 pages and 10 figures"},{"id":"http://arxiv.org/abs/2401.09796v2","updated":"2024-01-19T15:09:45Z","published":"2024-01-18T08:33:09Z","title":"A Fast, Performant, Secure Distributed Training Framework For Large\n Language Model","summary":" The distributed (federated) LLM is an important method for co-training the\ndomain-specific LLM using siloed data. However, maliciously stealing model\nparameters and data from the server or client side has become an urgent problem\nto be solved. In this paper, we propose a secure distributed LLM based on model\nslicing. In this case, we deploy the Trusted Execution Environment (TEE) on\nboth the client and server side, and put the fine-tuned structure (LoRA or\nembedding of P-tuning v2) into the TEE. Then, secure communication is executed\nin the TEE and general environments through lightweight encryption. In order to\nfurther reduce the equipment cost as well as increase the model performance and\naccuracy, we propose a split fine-tuning scheme. In particular, we split the\nLLM by layers and place the latter layers in a server-side TEE (the client does\nnot need a TEE). We then combine the proposed Sparsification Parameter\nFine-tuning (SPF) with the LoRA part to improve the accuracy of the downstream\ntask. 
Numerous experiments have shown that our method guarantees accuracy while\nmaintaining security.\n","authors":["Wei Huang","Yinggui Wang","Anda Cheng","Aihui Zhou","Chaofan Yu","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09796v2.pdf","comment":"Accepted by ICASSP 2024 (Federated LLM)"},{"id":"http://arxiv.org/abs/2306.17248v2","updated":"2024-01-19T15:01:52Z","published":"2023-06-29T18:34:37Z","title":"TemperatureGAN: Generative Modeling of Regional Atmospheric Temperatures","summary":" Stochastic generators are useful for estimating climate impacts on various\nsectors. Projecting climate risk in various sectors, e.g. energy systems,\nrequires generators that are accurate (statistical resemblance to\nground-truth), reliable (do not produce erroneous examples), and efficient.\nLeveraging data from the North American Land Data Assimilation System, we\nintroduce TemperatureGAN, a Generative Adversarial Network conditioned on\nmonths, locations, and time periods, to generate 2m above ground atmospheric\ntemperatures at an hourly resolution. We propose evaluation methods and metrics\nto measure the quality of generated samples. We show that TemperatureGAN\nproduces high-fidelity examples with good spatial representation and temporal\ndynamics consistent with known diurnal cycles.\n","authors":["Emmanuel Balogun","Ram Rajagopal","Arun Majumdar"],"pdf_url":"https://arxiv.org/pdf/2306.17248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09234v2","updated":"2024-01-19T14:57:06Z","published":"2023-12-14T18:57:16Z","title":"Let's do the time-warp-attend: Learning topological invariants of\n dynamical systems","summary":" Dynamical systems across the sciences, from electrical circuits to ecological\nnetworks, undergo qualitative and often catastrophic changes in behavior,\ncalled bifurcations, when their underlying parameters cross a threshold.\nExisting methods predict oncoming catastrophes in individual systems but are\nprimarily time-series-based and struggle both to categorize qualitative\ndynamical regimes across diverse systems and to generalize to real data. To\naddress this challenge, we propose a data-driven, physically-informed\ndeep-learning framework for classifying dynamical regimes and characterizing\nbifurcation boundaries based on the extraction of topologically invariant\nfeatures. We focus on the paradigmatic case of the supercritical Hopf\nbifurcation, which is used to model periodic dynamics across a wide range of\napplications. Our convolutional attention method is trained with data\naugmentations that encourage the learning of topological invariants which can\nbe used to detect bifurcation boundaries in unseen systems and to design models\nof biological systems like oscillatory gene regulatory networks. We further\ndemonstrate our method's use in analyzing real data by recovering distinct\nproliferation and differentiation dynamics along pancreatic endocrinogenesis\ntrajectory in gene expression space based on single-cell data. 
Our method\nprovides valuable insights into the qualitative, long-term behavior of a wide\nrange of dynamical systems, and can detect bifurcations or catastrophic\ntransitions in large-scale physical and biological systems.\n","authors":["Noa Moriel","Matthew Ricci","Mor Nitzan"],"pdf_url":"https://arxiv.org/pdf/2312.09234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02901v2","updated":"2024-01-19T14:53:51Z","published":"2023-03-06T05:35:32Z","title":"$α$-divergence Improves the Entropy Production Estimation via\n Machine Learning","summary":" Recent years have seen a surge of interest in the algorithmic estimation of\nstochastic entropy production (EP) from trajectory data via machine learning. A\ncrucial element of such algorithms is the identification of a loss function\nwhose minimization guarantees the accurate EP estimation. In this study, we\nshow that there exists a host of loss functions, namely those implementing a\nvariational representation of the $\\alpha$-divergence, which can be used for\nthe EP estimation. By fixing $\\alpha$ to a value between $-1$ and $0$, the\n$\\alpha$-NEEP (Neural Estimator for Entropy Production) exhibits a much more\nrobust performance against strong nonequilibrium driving or slow dynamics,\nwhich adversely affects the existing method based on the Kullback-Leibler\ndivergence ($\\alpha = 0$). In particular, the choice of $\\alpha = -0.5$ tends\nto yield the optimal results. To corroborate our findings, we present an\nexactly solvable simplification of the EP estimation problem, whose loss\nfunction landscape and stochastic properties give deeper intuition into the\nrobustness of the $\\alpha$-NEEP.\n","authors":["Euijoon Kwon","Yongjoo Baek"],"pdf_url":"https://arxiv.org/pdf/2303.02901v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10726v1","updated":"2024-01-19T14:43:04Z","published":"2024-01-19T14:43:04Z","title":"Empowering Aggregators with Practical Data-Driven Tools: Harnessing\n Aggregated and Disaggregated Flexibility for Demand Response","summary":" This study explores the crucial interplay between aggregators and building\noccupants in activating flexibility through Demand Response (DR) programs, with\na keen focus on achieving robust decarbonization and fortifying the resilience\nof the energy system amidst the uncertainties presented by Renewable Energy\nSources (RES). Firstly, it introduces a methodology of optimizing aggregated\nflexibility provision strategies in environments with limited data, utilizing\nDiscrete Fourier Transformation (DFT) and clustering techniques to identify\nbuilding occupant's activity patterns. Secondly, the study assesses the\ndisaggregated flexibility provision of Heating Ventilation and Air Conditioning\n(HVAC) systems during DR events, employing machine learning and optimization\ntechniques for precise, device-level analysis. The first approach offers a\nnon-intrusive pathway for aggregators to provide flexibility services in\nenvironments of a single smart meter for the whole building's consumption,\nwhile the second approach carefully considers building occupants' thermal\ncomfort profiles, while maximizing flexibility in case of existence of\ndedicated smart meters to the HVAC systems. 
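Editor's sketch: the aggregator entry above describes a first approach that characterizes occupant activity patterns from a single smart meter using a Discrete Fourier Transform followed by clustering. The snippet below is a loose illustration of that pipeline on invented hourly profiles; the number of clusters and retained frequency components are assumptions, not values from the paper.

```python
# Cluster daily consumption profiles by their dominant frequency components.
# `daily_profiles` stands in for hourly smart-meter readings, shape (n_days, 24).
import numpy as np
from sklearn.cluster import KMeans

daily_profiles = np.random.rand(365, 24)                 # toy stand-in data
spectra = np.abs(np.fft.rfft(daily_profiles, axis=1))    # DFT magnitude per day
features = spectra[:, :6]                                # keep low-frequency terms
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(features)
```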
Through the application of\ndata-driven techniques and encompassing case studies from both industrial and\nresidential buildings, this paper not only unveils pivotal opportunities for\naggregators in the balancing and emerging flexibility markets but also\nsuccessfully develops end-to-end practical tools for aggregators. Furthermore,\nthe efficacy of this tool is validated through detailed case studies,\nsubstantiating its operational capability and contributing to the evolution of\na resilient and efficient energy system.\n","authors":["Costas Mylonas","Donata Boric","Leila Luttenberger Maric","Alexandros Tsitsanis","Eleftheria Petrianou","Magda Foti"],"pdf_url":"https://arxiv.org/pdf/2401.10726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10724v1","updated":"2024-01-19T14:36:01Z","published":"2024-01-19T14:36:01Z","title":"Real-Time Zero-Day Intrusion Detection System for Automotive Controller\n Area Network on FPGAs","summary":" Increasing automation in vehicles enabled by increased connectivity to the\noutside world has exposed vulnerabilities in previously siloed automotive\nnetworks like controller area networks (CAN). Attributes of CAN such as\nbroadcast-based communication among electronic control units (ECUs) that\nlowered deployment costs are now being exploited to carry out active injection\nattacks like denial of service (DoS), fuzzing, and spoofing attacks. Research\nliterature has proposed multiple supervised machine learning models deployed as\nIntrusion detection systems (IDSs) to detect such malicious activity; however,\nthese are largely limited to identifying previously known attack vectors. With\nthe ever-increasing complexity of active injection attacks, detecting zero-day\n(novel) attacks in these networks in real-time (to prevent propagation) becomes\na problem of particular interest. This paper presents an\nunsupervised-learning-based convolutional autoencoder architecture for\ndetecting zero-day attacks, which is trained only on benign (attack-free) CAN\nmessages. We quantise the model using Vitis-AI tools from AMD/Xilinx targeting\na resource-constrained Zynq Ultrascale platform as our IDS-ECU system for\nintegration. The proposed model successfully achieves equal or higher\nclassification accuracy (> 99.5%) on unseen DoS, fuzzing, and spoofing attacks\nfrom a publicly available attack dataset when compared to the state-of-the-art\nunsupervised learning-based IDSs. Additionally, by cleverly overlapping IDS\noperation on a window of CAN messages with the reception, the model is able to\nmeet line-rate detection (0.43 ms per window) of high-speed CAN, which when\ncoupled with the low energy consumption per inference, makes this architecture\nideally suited for detecting zero-day attacks on critical CAN networks.\n","authors":["Shashwat Khandelwal","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10724v1.pdf","comment":"8 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.03976v2","updated":"2024-01-19T14:34:47Z","published":"2023-11-07T13:24:01Z","title":"A Foundation Graph Model","summary":" The principal benefit of unsupervised graph representation learning is that a\npre-trained model can be fine-tuned where data or labels are scarce. Existing\napproaches are domain specific, maintaining consistent node and edge attributes\nacross the pre-training and target datasets. This precludes transfer to other\ndomains. 
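Editor's sketch: the zero-day CAN IDS entry above trains a convolutional autoencoder only on benign traffic and flags windows whose reconstruction error is unusually high. The following is a generic, hypothetical version of that recipe; the 16x16 window encoding, layer sizes, and three-sigma threshold are illustrative assumptions rather than the paper's quantized FPGA model.

```python
# Convolutional autoencoder trained on benign CAN windows; high reconstruction
# error at run time flags a (possibly zero-day) attack.
import torch
import torch.nn as nn

class ConvAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 8, 3, stride=2, padding=1), nn.ReLU(),   # 16x16 -> 8x8
            nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU())  # 8x8 -> 4x4
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(16, 8, 2, stride=2), nn.ReLU(),    # 4x4 -> 8x8
            nn.ConvTranspose2d(8, 1, 2, stride=2), nn.Sigmoid())  # 8x8 -> 16x16

    def forward(self, x):
        return self.decoder(self.encoder(x))

model = ConvAutoencoder()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
benign = torch.rand(256, 1, 16, 16)       # stand-in for benign message windows

for _ in range(5):                         # train on attack-free traffic only
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(benign), benign)
    loss.backward()
    opt.step()

with torch.no_grad():
    errors = ((model(benign) - benign) ** 2).mean(dim=(1, 2, 3))
threshold = errors.mean() + 3 * errors.std()   # benign-derived detection threshold
```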
A model capable of positive transfer on arbitrary tasks and domains\nwould represent the first foundation graph model.\n In this work we use adversarial contrastive learning to present FoToM, a\ngraph pre-training method based on node and edge feature exclusion. We use\nFoToM to pre-train models over multiple graph domains, producing the first\nfoundation graph models. We demonstrate positive transfer on evaluation\ndatasets from multiple domains, including domains not present in pre-training\ndata. On all datasets performance is at worst on-par and on 76% significantly\nbetter than a supervised baseline ($P \\leq 0.01$), with an 8 to 40% reduction\nin error at 95% confidence. Contrary to other research, pre-training on a\ndataset with the target domain excluded leads us to better performance than\npre-training on a dataset from only the target domain. The multi-domain model\nat worst, matches, and on 56% of tasks, significantly outperforms single-domain\n($P \\leq 0.01$). These results include when node labels are used in evaluation,\nwhere performance is consistently superior to single-domain or non-pre-trained\nmodels. Notably, FoToM benefits scenarios in both large or scarce data regimes\nfor the target domains.\n","authors":["Alex O. Davies","Riku W. Green","Nirav S. Ajmeri","Telmo M. Silva Filho"],"pdf_url":"https://arxiv.org/pdf/2311.03976v2.pdf","comment":"Presented at the NeurIPS 2023 New Frontiers in Graph Learning\n workshop"},{"id":"http://arxiv.org/abs/2401.10721v1","updated":"2024-01-19T14:32:50Z","published":"2024-01-19T14:32:50Z","title":"Generative Model for Constructing Reaction Path from Initial to Final\n States","summary":" Mapping out reaction pathways and their corresponding activation barriers is\na significant aspect of molecular simulation. Given their inherent complexity\nand nonlinearity, even generating a initial guess of these paths remains a\nchallenging problem. Presented in this paper is an innovative approach that\nutilizes neural networks to generate initial guess for these reaction pathways.\nThe proposed method is initiated by inputting the coordinates of the initial\nstate, followed by progressive alterations to its structure. This iterative\nprocess culminates in the generation of the approximate representation of the\nreaction path and the coordinates of the final state. The application of this\nmethod extends to complex reaction pathways illustrated by organic reactions.\nTraining was executed on the Transition1x dataset, an organic reaction pathway\ndataset. The results revealed generation of reactions that bore substantial\nsimilarities with the corresponding test data. The method's flexibility allows\nfor reactions to be generated either to conform to predetermined conditions or\nin a randomized manner.\n","authors":["Akihide Hayashi","So Takamoto","Ju Li","Daisuke Okanohara"],"pdf_url":"https://arxiv.org/pdf/2401.10721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10710v1","updated":"2024-01-19T14:18:32Z","published":"2024-01-19T14:18:32Z","title":"Classification with neural networks with quadratic decision functions","summary":" Neural network with quadratic decision functions have been introduced as\nalternatives to standard neural networks with affine linear one. They are\nadvantageous when the objects to be identified are of compact basic geometries\nlike circles, ellipsis etc. In this paper we investigate the use of such ansatz\nfunctions for classification. 
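Editor's sketch: the entry above studies networks whose decision functions are quadratic rather than affine. A compact PyTorch illustration of such an output layer is given below, scoring each class with x^T A_k x + b_k^T x + c_k; it is a generic sketch of the idea, not the authors' TensorFlow/Keras implementation.

```python
# A "quadratic decision function" output layer: one quadratic form per class.
import torch
import torch.nn as nn

class QuadraticDecision(nn.Module):
    def __init__(self, in_features, num_classes):
        super().__init__()
        self.A = nn.Parameter(torch.randn(num_classes, in_features, in_features) * 0.01)
        self.b = nn.Parameter(torch.zeros(num_classes, in_features))
        self.c = nn.Parameter(torch.zeros(num_classes))

    def forward(self, x):                                  # x: (batch, in_features)
        quad = torch.einsum('bi,kij,bj->bk', x, self.A, x) # x^T A_k x per class
        lin = x @ self.b.T                                 # b_k^T x per class
        return quad + lin + self.c

layer = QuadraticDecision(in_features=784, num_classes=10)
scores = layer(torch.randn(32, 784))                       # (32, 10) class scores
```

Because each class score is a quadratic form, level sets are ellipsoids or other conics, which is why such layers suit compact geometric classes like circles and ellipses.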
In particular we test and compare the algorithm\non the MNIST dataset for classification of handwritten digits and for\nclassification of subspecies. We also show, that the implementation can be\nbased on the neural network structure in the software Tensorflow and Keras,\nrespectively.\n","authors":["Leon Frischauf","Otmar Scherzer","Cong Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15591v2","updated":"2024-01-19T14:08:23Z","published":"2023-12-25T02:32:05Z","title":"Privacy-Preserving Neural Graph Databases","summary":" In the era of big data and rapidly evolving information systems, efficient\nand accurate data retrieval has become increasingly crucial. Neural graph\ndatabases (NGDBs) have emerged as a powerful paradigm that combines the\nstrengths of graph databases (graph DBs) and neural networks to enable\nefficient storage, retrieval, and analysis of graph-structured data. The usage\nof neural embedding storage and complex neural logical query answering provides\nNGDBs with generalization ability. When the graph is incomplete, by extracting\nlatent patterns and representations, neural graph databases can fill gaps in\nthe graph structure, revealing hidden relationships and enabling accurate query\nanswering. Nevertheless, this capability comes with inherent trade-offs, as it\nintroduces additional privacy risks to the database. Malicious attackers can\ninfer more sensitive information in the database using well-designed\ncombinatorial queries, such as by comparing the answer sets of where Turing\nAward winners born before 1950 and after 1940 lived, the living places of\nTuring Award winner Hinton are probably exposed, although the living places may\nhave been deleted in the training due to the privacy concerns. In this work,\ninspired by the privacy protection in graph embeddings, we propose a\nprivacy-preserving neural graph database (P-NGDB) to alleviate the risks of\nprivacy leakage in NGDBs. We introduce adversarial training techniques in the\ntraining stage to force the NGDBs to generate indistinguishable answers when\nqueried with private information, enhancing the difficulty of inferring\nsensitive information through combinations of multiple innocuous queries.\nExtensive experiment results on three datasets show that P-NGDB can effectively\nprotect private information in the graph database while delivering high-quality\npublic answers responses to queries.\n","authors":["Qi Hu","Haoran Li","Jiaxin Bai","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2312.15591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10700v1","updated":"2024-01-19T14:05:09Z","published":"2024-01-19T14:05:09Z","title":"Safe Offline Reinforcement Learning with Feasibility-Guided Diffusion\n Model","summary":" Safe offline RL is a promising way to bypass risky online interactions\ntowards safe policy learning. Most existing methods only enforce soft\nconstraints, i.e., constraining safety violations in expectation below\nthresholds predetermined. This can lead to potentially unsafe outcomes, thus\nunacceptable in safety-critical scenarios. An alternative is to enforce the\nhard constraint of zero violation. However, this can be challenging in offline\nsetting, as it needs to strike the right balance among three highly intricate\nand correlated aspects: safety constraint satisfaction, reward maximization,\nand behavior regularization imposed by offline datasets. 
Interestingly, we\ndiscover that via reachability analysis of safe-control theory, the hard safety\nconstraint can be equivalently translated to identifying the largest feasible\nregion given the offline dataset. This seamlessly converts the original trilogy\nproblem to a feasibility-dependent objective, i.e., maximizing reward value\nwithin the feasible region while minimizing safety risks in the infeasible\nregion. Inspired by these, we propose FISOR (FeasIbility-guided Safe Offline\nRL), which allows safety constraint adherence, reward maximization, and offline\npolicy learning to be realized via three decoupled processes, while offering\nstrong safety performance and stability. In FISOR, the optimal policy for the\ntranslated optimization problem can be derived in a special form of weighted\nbehavior cloning. Thus, we propose a novel energy-guided diffusion model that\ndoes not require training a complicated time-dependent classifier to extract\nthe policy, greatly simplifying the training. We compare FISOR against\nbaselines on DSRL benchmark for safe offline RL. Evaluation results show that\nFISOR is the only method that can guarantee safety satisfaction in all tasks,\nwhile achieving top returns in most tasks.\n","authors":["Yinan Zheng","Jianxiong Li","Dongjie Yu","Yujie Yang","Shengbo Eben Li","Xianyuan Zhan","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10700v1.pdf","comment":"ICLR 2024, 30pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.09902v2","updated":"2024-01-19T14:04:22Z","published":"2024-01-18T11:32:50Z","title":"Interplay between depth and width for interpolation in neural ODEs","summary":" Neural ordinary differential equations (neural ODEs) have emerged as a\nnatural tool for supervised learning from a control perspective, yet a complete\nunderstanding of their optimal architecture remains elusive. In this work, we\nexamine the interplay between their width $p$ and number of layer transitions\n$L$ (effectively the depth $L+1$). Specifically, we assess the model\nexpressivity in terms of its capacity to interpolate either a finite dataset\n$D$ comprising $N$ pairs of points or two probability measures in\n$\\mathbb{R}^d$ within a Wasserstein error margin $\\varepsilon>0$. Our findings\nreveal a balancing trade-off between $p$ and $L$, with $L$ scaling as\n$O(1+N/p)$ for dataset interpolation, and\n$L=O\\left(1+(p\\varepsilon^d)^{-1}\\right)$ for measure interpolation.\n In the autonomous case, where $L=0$, a separate study is required, which we\nundertake focusing on dataset interpolation. We address the relaxed problem of\n$\\varepsilon$-approximate controllability and establish an error decay of\n$\\varepsilon\\sim O(\\log(p)p^{-1/d})$. This decay rate is a consequence of\napplying a universal approximation theorem to a custom-built Lipschitz vector\nfield that interpolates $D$. In the high-dimensional setting, we further\ndemonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact\ncontrol.\n","authors":["Antonio Álvarez-López","Arselane Hadj Slimane","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2401.09902v2.pdf","comment":"16 pages, 10 figures, double column"},{"id":"http://arxiv.org/abs/2401.10690v1","updated":"2024-01-19T13:41:08Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. 
predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10689v1","updated":"2024-01-19T13:39:05Z","published":"2024-01-19T13:39:05Z","title":"A Lightweight Multi-Attack CAN Intrusion Detection System on Hybrid\n FPGAs","summary":" Rising connectivity in vehicles is enabling new capabilities like connected\nautonomous driving and advanced driver assistance systems (ADAS) for improving\nthe safety and reliability of next-generation vehicles. This increased access\nto in-vehicle functions compromises critical capabilities that use legacy\ninvehicle networks like Controller Area Network (CAN), which has no inherent\nsecurity or authentication mechanism. Intrusion detection and mitigation\napproaches, particularly using machine learning models, have shown promising\nresults in detecting multiple attack vectors in CAN through their ability to\ngeneralise to new vectors. However, most deployments require dedicated\ncomputing units like GPUs to perform line-rate detection, consuming much higher\npower. In this paper, we present a lightweight multi-attack quantised machine\nlearning model that is deployed using Xilinx's Deep Learning Processing Unit IP\non a Zynq Ultrascale+ (XCZU3EG) FPGA, which is trained and validated using the\npublic CAN Intrusion Detection dataset. The quantised model detects denial of\nservice and fuzzing attacks with an accuracy of above 99 % and a false positive\nrate of 0.07%, which are comparable to the state-of-the-art techniques in the\nliterature. The Intrusion Detection System (IDS) execution consumes just 2.0 W\nwith software tasks running on the ECU and achieves a 25 % reduction in\nper-message processing latency over the state-of-the-art implementations. 
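Editor's sketch: the dyadic-regression entry above argues that a model which regresses toward each entity's average value looks fine under global RMSE/MAE yet fails on eccentric observations. The toy experiment below illustrates that effect by binning errors of a mean-reverting predictor by eccentricity; it is only a didactic illustration and not the paper's EAUC metric, and all numbers are invented.

```python
# Toy illustration of eccentricity bias: a predictor that always returns the
# entity's average value has errors that grow with |y_true - entity mean|.
import numpy as np

rng = np.random.default_rng(0)
entity_means = rng.uniform(1, 5, size=100)            # e.g. average rating per user
entity_ids = rng.integers(0, 100, size=5000)
y_true = np.clip(rng.normal(entity_means[entity_ids], 1.0), 1, 5)

y_pred = entity_means[entity_ids]                      # mean-reverting predictor

eccentricity = np.abs(y_true - entity_means[entity_ids])
abs_error = np.abs(y_true - y_pred)

bins = np.digitize(eccentricity, [0.5, 1.0, 1.5, 2.0])
for b in range(bins.max() + 1):
    mask = bins == b
    if mask.any():
        print(f"eccentricity bin {b}: mean |error| = {abs_error[mask].mean():.2f} "
              f"over {mask.sum()} samples")
```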
This\ndeployment allows the ECU function to coexist with the IDS with minimal changes\nto the tasks, making it ideal for real-time IDS in in-vehicle systems.\n","authors":["Shashwat Khandelwal","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10689v1.pdf","comment":"5 pages, 2 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.10686v1","updated":"2024-01-19T13:33:23Z","published":"2024-01-19T13:33:23Z","title":"Manipulating Sparse Double Descent","summary":" This paper investigates the double descent phenomenon in two-layer neural\nnetworks, focusing on the role of L1 regularization and representation\ndimensions. It explores an alternative double descent phenomenon, named sparse\ndouble descent. The study emphasizes the complex relationship between model\ncomplexity, sparsity, and generalization, and suggests further research into\nmore diverse models and datasets. The findings contribute to a deeper\nunderstanding of neural network training and optimization.\n","authors":["Ya Shi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10685v1","updated":"2024-01-19T13:32:55Z","published":"2024-01-19T13:32:55Z","title":"Towards End-to-End GPS Localization with Neural Pseudorange Correction","summary":" Pseudorange errors are the root cause of localization inaccuracy in GPS.\nPrevious data-driven methods regress and eliminate pseudorange errors using\nhandcrafted intermediate labels. Unlike them, we propose an end-to-end GPS\nlocalization framework, E2E-PrNet, to train a neural network for pseudorange\ncorrection (PrNet) directly using the final task loss calculated with the\nground truth of GPS receiver states. The gradients of the loss with respect to\nlearnable parameters are backpropagated through a differentiable nonlinear\nleast squares optimizer to PrNet. The feasibility is verified with GPS data\ncollected by Android phones, showing that E2E-PrNet outperforms the\nstate-of-the-art end-to-end GPS localization methods.\n","authors":["Xu Weng","KV Ling","Haochen Liu","Kun Cao"],"pdf_url":"https://arxiv.org/pdf/2401.10685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10674v1","updated":"2024-01-19T13:13:38Z","published":"2024-01-19T13:13:38Z","title":"Deep Learning-based Embedded Intrusion Detection System for Automotive\n CAN","summary":" Rising complexity of in-vehicle electronics is enabling new capabilities like\nautonomous driving and active safety. However, rising automation also increases\nrisk of security threats which is compounded by lack of in-built security\nmeasures in legacy networks like CAN, allowing attackers to observe, tamper and\nmodify information shared over such broadcast networks. Various intrusion\ndetection approaches have been proposed to detect and tackle such threats, with\nmachine learning models proving highly effective. However, deploying machine\nlearning models will require high processing power through high-end processors\nor GPUs to perform them close to line rate. In this paper, we propose a hybrid\nFPGA-based ECU approach that can transparently integrate IDS functionality\nthrough a dedicated off-the-shelf hardware accelerator that implements a\ndeep-CNN intrusion detection model. 
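Editor's sketch: the sparse double descent entry above examines two-layer networks trained with L1 regularization so that sparsity can be swept against width. A minimal training loop with an explicit L1 penalty is shown below; the widths, learning rate, and penalty strength are arbitrary toy choices.

```python
# Two-layer network trained with an added L1 penalty on the weights.
import torch
import torch.nn as nn

def l1_penalty(model):
    return sum(p.abs().sum() for p in model.parameters())

model = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 1))
opt = torch.optim.SGD(model.parameters(), lr=1e-2)
x, y = torch.randn(128, 20), torch.randn(128, 1)       # toy regression data

for _ in range(100):
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(x), y) + 1e-4 * l1_penalty(model)
    loss.backward()
    opt.step()
```

Sweeping the penalty coefficient (and the hidden width) is what traces out the sparsity axis along which the sparse double descent curve is observed.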
Our results show that the proposed approach\nprovides an average accuracy of over 99% across multiple attack datasets with\n0.64% false detection rates while consuming 94% less energy and achieving 51.8%\nreduction in per-message processing latency when compared to IDS\nimplementations on GPUs.\n","authors":["Shashwat Khandelwal","Eashan Wadhwa","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10674v1.pdf","comment":"5 pages, 1 figure, 8 tables"},{"id":"http://arxiv.org/abs/2401.09691v2","updated":"2024-01-19T12:43:36Z","published":"2024-01-18T02:44:18Z","title":"Imitation Learning Inputting Image Feature to Each Layer of Neural\n Network","summary":" Imitation learning enables robots to learn and replicate human behavior from\ntraining data. Recent advances in machine learning enable end-to-end learning\napproaches that directly process high-dimensional observation data, such as\nimages. However, these approaches face a critical challenge when processing\ndata from multiple modalities, inadvertently ignoring data with a lower\ncorrelation to the desired output, especially when using short sampling\nperiods. This paper presents a useful method to address this challenge, which\namplifies the influence of data with a relatively low correlation to the output\nby inputting the data into each neural network layer. The proposed approach\neffectively incorporates diverse data sources into the learning process.\nThrough experiments using a simple pick-and-place operation with raw images and\njoint information as input, significant improvements in success rates are\ndemonstrated even when dealing with data from short sampling periods.\n","authors":["Koki Yamane","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2401.09691v2.pdf","comment":"6 pages, 4 figures, Accepted at AMC2024"},{"id":"http://arxiv.org/abs/2312.01185v2","updated":"2024-01-19T12:34:07Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address (SOTU) dataset from Kaggle\nto make some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2.\n While it is widely believed that BERT (and its variations) is most suitable\nfor NLP classification tasks, we find out that GPT-2 in conjunction with\nnonlinear dimension reduction methods such as UMAP provide better separation\nand stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In\nour case, no model fine-tuning is required, and the pre-trained out-of-the-box\nGPT-2 model is enough.\n We also used a fine-tuned DistilBERT model for classification detecting which\nPresident delivered which address, with very good results (accuracy 93\\% - 95\\%\ndepending on the run). An analogous task was performed to determine the year of\nwriting, and we were able to pin it down to about 4 years (which is a single\npresidential term).\n It is worth noting that SOTU addresses provide relatively small writing\nsamples (with about 8000 words on average, and varying widely from under 2000\nwords to more than 20000), and that the amount of authors is relatively large\n(we used SOTU addresses of 42 US presidents). 
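Editor's sketch: the SOTU entry above reports that out-of-the-box GPT-2 embeddings combined with UMAP separate the addresses well without fine-tuning. The snippet below sketches such a pipeline with the `transformers` and `umap-learn` packages; the mean-pooling choice, toy texts, and UMAP settings are assumptions for illustration.

```python
# Embed documents with pre-trained GPT-2, then project with UMAP.
import numpy as np
import torch
import umap
from transformers import GPT2Model, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2").eval()

def embed(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state       # (1, seq_len, 768)
    return hidden.mean(dim=1).squeeze(0).numpy()          # mean-pool over tokens

texts = [f"Toy address number {i} about the state of the union." for i in range(20)]
embeddings = np.stack([embed(t) for t in texts])
coords = umap.UMAP(n_components=2, n_neighbors=5, random_state=0).fit_transform(embeddings)
```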
This shows that the techniques\nemployed turn out to be rather efficient, while all the computations described\nin this note can be performed using a single GPU instance of Google Colab.\n The accompanying code is available on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v2.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2312.08010v2","updated":"2024-01-19T12:19:48Z","published":"2023-12-13T09:33:08Z","title":"EZ-CLIP: Efficient Zeroshot Video Action Recognition","summary":" Recent advancements in large-scale pre-training of visual-language models on\npaired image-text data have demonstrated impressive generalization capabilities\nfor zero-shot tasks. Building on this success, efforts have been made to adapt\nthese image-based visual-language models, such as CLIP, for videos extending\ntheir zero-shot capabilities to the video domain. While these adaptations have\nshown promising results, they come at a significant computational cost and\nstruggle with effectively modeling the crucial temporal aspects inherent to the\nvideo domain. In this study, we present EZ-CLIP, a simple and efficient\nadaptation of CLIP that addresses these challenges. EZ-CLIP leverages temporal\nvisual prompting for seamless temporal adaptation, requiring no fundamental\nalterations to the core CLIP architecture while preserving its remarkable\ngeneralization abilities. Moreover, we introduce a novel learning objective\nthat guides the temporal visual prompts to focus on capturing motion, thereby\nenhancing its learning capabilities from video data. We conducted extensive\nexperiments on five different benchmark datasets, thoroughly evaluating EZ-CLIP\nfor zero-shot learning and base-to-novel video action recognition, and also\ndemonstrating its potential for few-shot generalization.Impressively, with a\nmere 5.2 million learnable parameters (as opposed to the 71.1 million in the\nprior best model), EZ-CLIP can be efficiently trained on a single GPU,\noutperforming existing approaches in several evaluations.\n","authors":["Shahzad Ahmad","Sukalpa Chanda","Yogesh S Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10657v1","updated":"2024-01-19T12:04:31Z","published":"2024-01-19T12:04:31Z","title":"FIMBA: Evaluating the Robustness of AI in Genomics via Feature\n Importance Adversarial Attacks","summary":" With the steady rise of the use of AI in bio-technical applications and the\nwidespread adoption of genomics sequencing, an increasing amount of AI-based\nalgorithms and tools is entering the research and production stage affecting\ncritical decision-making streams like drug discovery and clinical outcomes.\nThis paper demonstrates the vulnerability of AI models often utilized\ndownstream tasks on recognized public genomics datasets. We undermine model\nrobustness by deploying an attack that focuses on input transformation while\nmimicking the real data and confusing the model decision-making, ultimately\nyielding a pronounced deterioration in model performance. Further, we enhance\nour approach by generating poisoned data using a variational autoencoder-based\nmodel. Our empirical findings unequivocally demonstrate a decline in model\nperformance, underscored by diminished accuracy and an upswing in false\npositives and false negatives. 
Furthermore, we analyze the resulting\nadversarial samples via spectral analysis yielding conclusions for\ncountermeasures against such attacks.\n","authors":["Heorhii Skovorodnikov","Hoda Alkhzaimi"],"pdf_url":"https://arxiv.org/pdf/2401.10657v1.pdf","comment":"15 pages, core code available at:\n https://github.com/HeorhiiS/fimba-attack"},{"id":"http://arxiv.org/abs/2401.10653v1","updated":"2024-01-19T11:59:13Z","published":"2024-01-19T11:59:13Z","title":"Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech\n Detection","summary":" With the recent surge and exponential growth of social media usage,\nscrutinizing social media content for the presence of any hateful content is of\nutmost importance. Researchers have been diligently working since the past\ndecade on distinguishing between content that promotes hatred and content that\ndoes not. Traditionally, the main focus has been on analyzing textual content.\nHowever, recent research attempts have also commenced into the identification\nof audio-based content. Nevertheless, studies have shown that relying solely on\naudio or text-based content may be ineffective, as recent upsurge indicates\nthat individuals often employ sarcasm in their speech and writing. To overcome\nthese challenges, we present an approach to identify whether a speech promotes\nhate or not utilizing both audio and textual representations. Our methodology\nis based on the Transformer framework that incorporates both audio and text\nsampling, accompanied by our very own layer called \"Attentive Fusion\". The\nresults of our study surpassed previous state-of-the-art techniques, achieving\nan impressive macro F1 score of 0.927 on the Test Set.\n","authors":["Atanu Mandal","Gargi Roy","Amit Barman","Indranil Dutta","Sudip Kumar Naskar"],"pdf_url":"https://arxiv.org/pdf/2401.10653v1.pdf","comment":"Accepted in 20th International Conference on Natural Language\n Processing (ICON)"},{"id":"http://arxiv.org/abs/2401.10652v1","updated":"2024-01-19T11:58:13Z","published":"2024-01-19T11:58:13Z","title":"AutoChunk: Automated Activation Chunk for Memory-Efficient Long Sequence\n Inference","summary":" Large deep learning models have achieved impressive performance across a\nrange of applications. However, their large memory requirements, including\nparameter memory and activation memory, have become a significant challenge for\ntheir practical serving. While existing methods mainly address parameter\nmemory, the importance of activation memory has been overlooked. Especially for\nlong input sequences, activation memory is expected to experience a significant\nexponential growth as the length of sequences increases. In this approach, we\npropose AutoChunk, an automatic and adaptive compiler system that efficiently\nreduces activation memory for long sequence inference by chunk strategies. The\nproposed system generates chunk plans by optimizing through multiple stages. In\neach stage, the chunk search pass explores all possible chunk candidates and\nthe chunk selection pass identifies the optimal one. At runtime, AutoChunk\nemploys code generation to automatically apply chunk strategies. 
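Editor's sketch: the AutoChunk entry above reduces activation memory by applying chunked execution along the sequence dimension. The snippet below shows the underlying idea by hand for a position-wise MLP; AutoChunk itself searches for and applies such chunk plans automatically via code generation, whereas here the chunk size is a fixed assumption.

```python
# Apply a position-wise MLP to a long sequence in slices so the peak size of
# intermediate activations stays bounded; the result equals mlp(x).
import torch
import torch.nn as nn

mlp = nn.Sequential(nn.Linear(512, 2048), nn.GELU(), nn.Linear(2048, 512))
x = torch.randn(1, 16384, 512)            # (batch, very long sequence, hidden)

def chunked_forward(module, x, chunk_size=2048):
    outputs = [module(x[:, i:i + chunk_size]) for i in range(0, x.shape[1], chunk_size)]
    return torch.cat(outputs, dim=1)

y = chunked_forward(mlp, x)
```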
The\nexperiments demonstrate that AutoChunk can reduce over 80\\% of activation\nmemory while maintaining speed loss within 10%, extend max sequence length by\n3.2x to 11.7x, and outperform state-of-the-art methods by a large margin.\n","authors":["Xuanlei Zhao","Shenggan Cheng","Guangyang Lu","Jiarui Fang","Haotian Zhou","Bin Jia","Ziming Liu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2401.10652v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10648v1","updated":"2024-01-19T11:48:52Z","published":"2024-01-19T11:48:52Z","title":"Area Modeling using Stay Information for Large-Scale Users and Analysis\n for Influence of COVID-19","summary":" Understanding how people use area in a city can be a valuable information in\na wide range of fields, from marketing to urban planning. Area usage is subject\nto change over time due to various events including seasonal shifts and\npandemics. Before the spread of smartphones, this data had been collected\nthrough questionnaire survey. However, this is not a sustainable approach in\nterms of time to results and cost. There are many existing studies on area\nmodeling, which characterize an area with some kind of information, using Point\nof Interest (POI) or inter-area movement data. However, since POI is data that\nis statically tied to space, and inter-area movement data ignores the behavior\nof people within an area, existing methods are not sufficient in terms of\ncapturing area usage changes. In this paper, we propose a novel area modeling\nmethod named Area2Vec, inspired by Word2Vec, which models areas based on\npeople's location data. This method is based on the discovery that it is\npossible to characterize an area based on its usage by using people's stay\ninformation in the area. And it is a novel method that can reflect the\ndynamically changing people's behavior in an area in the modeling results. We\nvalidated Area2vec by performing a functional classification of areas in a\ndistrict of Japan. The results show that Area2Vec can be usable in general area\nanalysis. We also investigated area usage changes due to COVID-19 in two\ndistricts in Japan. We could find that COVID-19 made people refrain from\nunnecessary going out, such as visiting entertainment areas.\n","authors":["Kazuyuki Shoji","Shunsuke Aoki","Takuro Yonezawa","Nobuo Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2401.10648v1.pdf","comment":"This paper is an English translation of the paper published in the\n Transactions of the Information Processing Society of Japan\n (http://doi.org/10.20729/00213190)"},{"id":"http://arxiv.org/abs/2401.10646v1","updated":"2024-01-19T11:47:49Z","published":"2024-01-19T11:47:49Z","title":"Empowering HWNs with Efficient Data Labeling: A Clustered Federated\n Semi-Supervised Learning Approach","summary":" Clustered Federated Multitask Learning (CFL) has gained considerable\nattention as an effective strategy for overcoming statistical challenges,\nparticularly when dealing with non independent and identically distributed (non\nIID) data across multiple users. However, much of the existing research on CFL\noperates under the unrealistic premise that devices have access to accurate\nground truth labels. This assumption becomes especially problematic in\nhierarchical wireless networks (HWNs), where edge networks contain a large\namount of unlabeled data, resulting in slower convergence rates and increased\nprocessing times, particularly when dealing with two layers of model\naggregation. 
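Editor's sketch: the Area2Vec entry above models areas from people's stay information with a Word2Vec-inspired objective. The snippet below is a loose, hypothetical analogue using gensim, treating each person's sequence of visited areas as a "sentence"; the area IDs and sequences are invented, and the paper's actual featurization of stay duration and time of day is not reproduced here.

```python
# Word2Vec-style area embeddings learned from toy sequences of visited areas.
from gensim.models import Word2Vec

stay_sequences = [
    ["area_12", "area_03", "area_07", "area_03"],
    ["area_07", "area_07", "area_15", "area_12"],
    ["area_03", "area_15", "area_12", "area_07"],
]
model = Word2Vec(sentences=stay_sequences, vector_size=16, window=2,
                 min_count=1, sg=1, epochs=50, seed=0)
vector_for_area_03 = model.wv["area_03"]   # learned embedding for one area
```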
To address these issues, we introduce a novel framework, Clustered\nFederated Semi-Supervised Learning (CFSL), designed for more realistic HWN\nscenarios. Our approach leverages a best-performing specialized model\nalgorithm, wherein each device is assigned a specialized model that is highly\nadept at generating accurate pseudo-labels for unlabeled data, even when the\ndata stems from diverse environments. We validate the efficacy of CFSL through\nextensive experiments, comparing it with existing methods highlighted in recent\nliterature. Our numerical results demonstrate that CFSL significantly improves\nupon key metrics such as testing accuracy, labeling accuracy, and labeling\nlatency under varying proportions of labeled and unlabeled data while also\naccommodating the non-IID nature of the data and the unique characteristics of\nwireless edge networks.\n","authors":["Moqbel Hamood","Abdullatif Albaseer","Mohamed Abdallah","Ala Al-Fuqaha"],"pdf_url":"https://arxiv.org/pdf/2401.10646v1.pdf","comment":"Accepted for IEEE Wireless Communications and Networking Conference\n (WCNC) 2024"},{"id":"http://arxiv.org/abs/2401.10643v1","updated":"2024-01-19T11:45:10Z","published":"2024-01-19T11:45:10Z","title":"A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification:\n Models, Data Sets and Challenges","summary":" Vehicle re-identification (ReID) endeavors to associate vehicle images\ncollected from a distributed network of cameras spanning diverse traffic\nenvironments. This task assumes paramount importance within the spectrum of\nvehicle-centric technologies, playing a pivotal role in deploying Intelligent\nTransportation Systems (ITS) and advancing smart city initiatives. Rapid\nadvancements in deep learning have significantly propelled the evolution of\nvehicle ReID technologies in recent years. Consequently, undertaking a\ncomprehensive survey of methodologies centered on deep learning for vehicle\nre-identification has become imperative and inescapable. This paper extensively\nexplores deep learning techniques applied to vehicle ReID. It outlines the\ncategorization of these methods, encompassing supervised and unsupervised\napproaches, delves into existing research within these categories, introduces\ndatasets and evaluation criteria, and delineates forthcoming challenges and\npotential research directions. This comprehensive assessment examines the\nlandscape of deep learning in vehicle ReID and establishes a foundation and\nstarting point for future works. It aims to serve as a complete reference by\nhighlighting challenges and emerging trends, fostering advancements and\napplications in vehicle ReID utilizing deep learning models.\n","authors":["Ali Amiri","Aydin Kaya","Ali Seydi Keceli"],"pdf_url":"https://arxiv.org/pdf/2401.10643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10637v1","updated":"2024-01-19T11:35:07Z","published":"2024-01-19T11:35:07Z","title":"Towards Universal Unsupervised Anomaly Detection in Medical Imaging","summary":" The increasing complexity of medical imaging data underscores the need for\nadvanced anomaly detection methods to automatically identify diverse\npathologies. Current methods face challenges in capturing the broad spectrum of\nanomalies, often limiting their use to specific lesion types in brain scans. 
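Editor's sketch: the CFSL entry above relies on specialized models that generate pseudo-labels for unlabeled edge data. A generic confidence-thresholded pseudo-labeling step, which is the common core of such schemes, is sketched below; the threshold and the stand-in classifier are assumptions, not the CFSL algorithm itself.

```python
# Confidence-based pseudo-labelling: keep only predictions the model is sure of.
import torch
import torch.nn.functional as F

def pseudo_label(model, unlabeled_x, threshold=0.9):
    with torch.no_grad():
        probs = F.softmax(model(unlabeled_x), dim=1)
    conf, labels = probs.max(dim=1)
    mask = conf >= threshold                 # retain only confident predictions
    return unlabeled_x[mask], labels[mask]

model = torch.nn.Linear(32, 5)               # stand-in for a specialised model
x_unlab = torch.randn(1000, 32)
x_sel, y_pseudo = pseudo_label(model, x_unlab)
```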
To\naddress this challenge, we introduce a novel unsupervised approach, termed\n\\textit{Reversed Auto-Encoders (RA)}, designed to create realistic\npseudo-healthy reconstructions that enable the detection of a wider range of\npathologies. We evaluate the proposed method across various imaging modalities,\nincluding magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray,\nand chest X-ray, and demonstrate superior performance in detecting anomalies\ncompared to existing state-of-the-art methods. Our unsupervised anomaly\ndetection approach may enhance diagnostic accuracy in medical imaging by\nidentifying a broader range of unknown pathologies. Our code is publicly\navailable at: \\url{https://github.com/ci-ber/RA}.\n","authors":["Cosmin I. Bercea","Benedikt Wiestler","Daniel Rueckert","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2401.10637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10632v1","updated":"2024-01-19T11:20:31Z","published":"2024-01-19T11:20:31Z","title":"Interventional Fairness on Partially Known Causal Graphs: A Constrained\n Optimization Approach","summary":" Fair machine learning aims to prevent discrimination against individuals or\nsub-populations based on sensitive attributes such as gender and race. In\nrecent years, causal inference methods have been increasingly used in fair\nmachine learning to measure unfairness by causal effects. However, current\nmethods assume that the true causal graph is given, which is often not true in\nreal-world applications. To address this limitation, this paper proposes a\nframework for achieving causal fairness based on the notion of interventions\nwhen the true causal graph is partially known. The proposed approach involves\nmodeling fair prediction using a Partially Directed Acyclic Graph (PDAG),\nspecifically, a class of causal DAGs that can be learned from observational\ndata combined with domain knowledge. The PDAG is used to measure causal\nfairness, and a constrained optimization problem is formulated to balance\nbetween fairness and accuracy. Results on both simulated and real-world\ndatasets demonstrate the effectiveness of this method.\n","authors":["Aoqi Zuo","Yiqing Li","Susan Wei","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2401.10632v1.pdf","comment":"Accepted to ICLR24"},{"id":"http://arxiv.org/abs/2401.10620v1","updated":"2024-01-19T10:52:57Z","published":"2024-01-19T10:52:57Z","title":"Polytopic Autoencoders with Smooth Clustering for Reduced-order\n Modelling of Flows","summary":" With the advancement of neural networks, there has been a notable increase,\nboth in terms of quantity and variety, in research publications concerning the\napplication of autoencoders to reduced-order models. We propose a polytopic\nautoencoder architecture that includes a lightweight nonlinear encoder, a\nconvex combination decoder, and a smooth clustering network. Supported by\nseveral proofs, the model architecture ensures that all reconstructed states\nlie within a polytope, accompanied by a metric indicating the quality of the\nconstructed polytopes, referred to as polytope error. Additionally, it offers a\nminimal number of convex coordinates for polytopic linear-parameter varying\nsystems while achieving acceptable reconstruction errors compared to proper\northogonal decomposition (POD). To validate our proposed model, we conduct\nsimulations involving two flow scenarios with the incompressible Navier-Stokes\nequation. 
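Editor's sketch: the polytopic autoencoder entry above constrains reconstructions to lie inside a polytope via a convex-combination decoder. The snippet below illustrates that single design choice with softmax "convex coordinates" over learned vertex states; it is a simplified illustration, not the paper's full architecture with its smooth clustering network.

```python
# Convex-combination decoder: every reconstruction is a convex combination of
# learned vertices, so it lies inside the polytope they span.
import torch
import torch.nn as nn

class PolytopicAE(nn.Module):
    def __init__(self, state_dim=128, n_vertices=8):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(),
                                     nn.Linear(64, n_vertices))
        self.vertices = nn.Parameter(torch.randn(n_vertices, state_dim))

    def forward(self, x):
        weights = torch.softmax(self.encoder(x), dim=-1)   # convex coordinates
        return weights @ self.vertices, weights

model = PolytopicAE()
x = torch.randn(4, 128)
x_hat, coords = model(x)        # x_hat is guaranteed to lie in the polytope
```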
Numerical results demonstrate the guaranteed properties of the model,\nlow reconstruction errors compared to POD, and the improvement in error using a\nclustering network.\n","authors":["Jan Heiland","Yongho Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10620v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2401.10603v1","updated":"2024-01-19T10:21:27Z","published":"2024-01-19T10:21:27Z","title":"ZnTrack -- Data as Code","summary":" The past decade has seen tremendous breakthroughs in computation and there is\nno indication that this will slow any time soon. Machine learning, large-scale\ncomputing resources, and increased industry focus have resulted in rising\ninvestments in computer-driven solutions for data management, simulations, and\nmodel generation. However, with this growth in computation has come an even\nlarger expansion of data and with it, complexity in data storage, sharing, and\ntracking. In this work, we introduce ZnTrack, a Python-driven data versioning\ntool. ZnTrack builds upon established version control systems to provide a\nuser-friendly and easy-to-use interface for tracking parameters in experiments,\ndesigning workflows, and storing and sharing data. From this ability to reduce\nlarge datasets to a simple Python script emerges the concept of Data as Code, a\ncore component of the work presented here and an undoubtedly important concept\nas the age of computation continues to evolve. ZnTrack offers an open-source,\nFAIR data compatible Python package to enable users to harness these concepts\nof the future.\n","authors":["Fabian Zills","Moritz Schäfer","Samuel Tovey","Johannes Kästner","Christian Holm"],"pdf_url":"https://arxiv.org/pdf/2401.10603v1.pdf","comment":"22 pages, 10 figures, 2MB PDF"},{"id":"http://arxiv.org/abs/2311.11809v2","updated":"2024-01-19T10:10:27Z","published":"2023-11-20T14:42:13Z","title":"LogLead -- Fast and Integrated Log Loader, Enhancer, and Anomaly\n Detector","summary":" This paper introduces LogLead, a tool designed for efficient log analysis\nbenchmarking. LogLead combines three essential steps in log processing:\nloading, enhancing, and anomaly detection. The tool leverages Polars, a\nhigh-speed DataFrame library. We currently have Loaders for eight systems that\nare publicly available (HDFS, Hadoop, BGL, Thunderbird, Spirit, Liberty,\nTrainTicket, and GC Webshop). We have multiple enhancers with three parsers\n(Drain, Spell, LenMa), Bert embedding creation and other log representation\ntechniques like bag-of-words. LogLead integrates to five supervised and four\nunsupervised machine learning algorithms for anomaly detection from SKLearn. By\nintegrating diverse datasets, log representation methods and anomaly detectors,\nLogLead facilitates comprehensive benchmarking in log analysis research. We\nshow that log loading from raw file to dataframe is over 10x faster with\nLogLead compared to past solutions. We demonstrate roughly 2x improvement in\nDrain parsing speed by off-loading log message normalization to LogLead. Our\nbrief benchmarking on HDFS indicates that log representations extending beyond\nthe bag-of-words approach offer limited additional benefits. 
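Editor's sketch: the LogLead entry above benchmarks log representations such as bag-of-words against unsupervised anomaly detectors from scikit-learn. The snippet below strings those two standard pieces together on invented log lines; it is a minimal stand-in for the kind of pipeline LogLead automates, not LogLead's own API.

```python
# Bag-of-words log representation scored by an unsupervised detector.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import IsolationForest

logs = [
    "Received block blk_123 of size 67108864 from 10.0.0.1",
    "Received block blk_124 of size 67108864 from 10.0.0.2",
    "PacketResponder 1 for block blk_123 terminating",
    "Exception in receiveBlock for block blk_999 java.io.IOException",
]
X = CountVectorizer().fit_transform(logs)
scores = IsolationForest(random_state=0).fit(X).decision_function(X)
# Lower scores indicate more anomalous log lines.
```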
Tool URL:\nhttps://github.com/EvoTestOps/LogLead\n","authors":["Mika Mäntylä","Yuqing Wang","Jesse Nyyssölä"],"pdf_url":"https://arxiv.org/pdf/2311.11809v2.pdf","comment":"2024 IEEE International Conference on Software Analysis, Evolution\n and Reengineering (SANER)"},{"id":"http://arxiv.org/abs/2401.10590v1","updated":"2024-01-19T10:02:20Z","published":"2024-01-19T10:02:20Z","title":"Adversarially Robust Signed Graph Contrastive Learning from Balance\n Augmentation","summary":" Signed graphs consist of edges and signs, which can be separated into\nstructural information and balance-related information, respectively. Existing\nsigned graph neural networks (SGNNs) typically rely on balance-related\ninformation to generate embeddings. Nevertheless, the emergence of recent\nadversarial attacks has had a detrimental impact on the balance-related\ninformation. Similar to how structure learning can restore unsigned graphs,\nbalance learning can be applied to signed graphs by improving the balance\ndegree of the poisoned graph. However, this approach encounters the challenge\n\"Irreversibility of Balance-related Information\" - while the balance degree\nimproves, the restored edges may not be the ones originally affected by\nattacks, resulting in poor defense effectiveness. To address this challenge, we\npropose a robust SGNN framework called Balance Augmented-Signed Graph\nContrastive Learning (BA-SGCL), which combines Graph Contrastive Learning\nprinciples with balance augmentation techniques. Experimental results\ndemonstrate that BA-SGCL not only enhances robustness against existing\nadversarial attacks but also achieves superior performance on link sign\nprediction task across various datasets.\n","authors":["Jialong Zhou","Xing Ai","Yuni Lai","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.10590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10191v2","updated":"2024-01-19T10:01:36Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. 
The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v2.pdf","comment":"Accepted for ICLR 2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2401.10586v1","updated":"2024-01-19T09:54:23Z","published":"2024-01-19T09:54:23Z","title":"PuriDefense: Randomized Local Implicit Adversarial Purification for\n Defending Black-box Query-based Attacks","summary":" Black-box query-based attacks constitute significant threats to Machine\nLearning as a Service (MLaaS) systems since they can generate adversarial\nexamples without accessing the target model's architecture and parameters.\nTraditional defense mechanisms, such as adversarial training, gradient masking,\nand input transformations, either impose substantial computational costs or\ncompromise the test accuracy of non-adversarial inputs. To address these\nchallenges, we propose an efficient defense mechanism, PuriDefense, that\nemploys random patch-wise purifications with an ensemble of lightweight\npurification models at a low level of inference cost. These models leverage the\nlocal implicit function and rebuild the natural image manifold. Our theoretical\nanalysis suggests that this approach slows down the convergence of query-based\nattacks by incorporating randomness into purifications. Extensive experiments\non CIFAR-10 and ImageNet validate the effectiveness of our proposed\npurifier-based defense mechanism, demonstrating significant improvements in\nrobustness against query-based attacks.\n","authors":["Ping Guo","Zhiyuan Yang","Xi Lin","Qingchuan Zhao","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12399v3","updated":"2024-01-19T09:49:46Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. 
The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v3.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.10566v1","updated":"2024-01-19T09:10:58Z","published":"2024-01-19T09:10:58Z","title":"Robust Multi-Modal Density Estimation","summary":" Development of multi-modal, probabilistic prediction models has lead to a\nneed for comprehensive evaluation metrics. While several metrics can\ncharacterize the accuracy of machine-learned models (e.g., negative\nlog-likelihood, Jensen-Shannon divergence), these metrics typically operate on\nprobability densities. Applying them to purely sample-based prediction models\nthus requires that the underlying density function is estimated. However,\ncommon methods such as kernel density estimation (KDE) have been demonstrated\nto lack robustness, while more complex methods have not been evaluated in\nmulti-modal estimation problems. In this paper, we present ROME (RObust\nMulti-modal density Estimator), a non-parametric approach for density\nestimation which addresses the challenge of estimating multi-modal, non-normal,\nand highly correlated distributions. ROME utilizes clustering to segment a\nmulti-modal set of samples into multiple uni-modal ones and then combines\nsimple KDE estimates obtained for individual clusters in a single multi-modal\nestimate. We compared our approach to state-of-the-art methods for density\nestimation as well as ablations of ROME, showing that it not only outperforms\nestablished methods but is also more robust to a variety of distributions. Our\nresults demonstrate that ROME can overcome the issues of over-fitting and\nover-smoothing exhibited by other estimators, promising a more robust\nevaluation of probabilistic machine learning models.\n","authors":["Anna Mészáros","Julian F. Schumann","Javier Alonso-Mora","Arkady Zgonnikov","Jens Kober"],"pdf_url":"https://arxiv.org/pdf/2401.10566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10559v1","updated":"2024-01-19T08:50:54Z","published":"2024-01-19T08:50:54Z","title":"OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy","summary":" We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel\nmulti-adapter method, OrchMoE, which capitalizes on modular skill architecture\nfor enhanced forward transfer in neural networks. Unlike prior models that\ndepend on explicit task identification inputs, OrchMoE automatically discerns\ntask categories, streamlining the learning process. This is achieved through an\nintegrated mechanism comprising an Automatic Task Classification module and a\nTask-Skill Allocation module, which collectively deduce task-specific\nclassifications and tailor skill allocation matrices. Our extensive evaluations\non the 'Super Natural Instructions' dataset, featuring 1,600 diverse\ninstructional tasks, indicate that OrchMoE substantially outperforms comparable\nmulti-adapter baselines in terms of both performance and sample utilization\nefficiency, all while operating within the same parameter constraints. 
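Editor's sketch: the ROME entry above estimates multi-modal densities by clustering samples into roughly uni-modal groups and combining per-cluster KDEs. The snippet below is a simplified version of that cluster-then-KDE idea with a size-weighted mixture; the clustering method, cluster count, and toy data are assumptions rather than ROME's exact procedure.

```python
# Cluster samples, fit a Gaussian KDE per cluster, and mix the densities.
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.cluster import KMeans

samples = np.concatenate([np.random.normal(-4, 1.0, 500),
                          np.random.normal(3, 0.5, 500)])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(samples.reshape(-1, 1))

kdes, weights = [], []
for k in np.unique(labels):
    cluster = samples[labels == k]
    kdes.append(gaussian_kde(cluster))
    weights.append(len(cluster) / len(samples))

def density(x):
    # Mixture of per-cluster KDEs, weighted by cluster size.
    return sum(w * kde(x) for w, kde in zip(weights, kdes))

print(density(np.array([0.0, 3.0])))   # mixture density at two query points
```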
These\nfindings suggest that OrchMoE offers a significant leap forward in multi-task\nlearning efficiency.\n","authors":["Haowen Wang","Tao Sun","Kaixiang Ji","Jian Wang","Cong Fan","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2401.10559v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.10549v1","updated":"2024-01-19T08:26:44Z","published":"2024-01-19T08:26:44Z","title":"Unified View Imputation and Feature Selection Learning for Incomplete\n Multi-view Data","summary":" Although multi-view unsupervised feature selection (MUFS) is an effective\ntechnology for reducing dimensionality in machine learning, existing methods\ncannot directly deal with incomplete multi-view data where some samples are\nmissing in certain views. These methods should first apply predetermined values\nto impute missing data, then perform feature selection on the complete dataset.\nSeparating imputation and feature selection processes fails to capitalize on\nthe potential synergy where local structural information gleaned from feature\nselection could guide the imputation, thereby improving the feature selection\nperformance in turn. Additionally, previous methods only focus on leveraging\nsamples' local structure information, while ignoring the intrinsic locality of\nthe feature space. To tackle these problems, a novel MUFS method, called\nUNified view Imputation and Feature selectIon lEaRning (UNIFIER), is proposed.\nUNIFIER explores the local structure of multi-view data by adaptively learning\nsimilarity-induced graphs from both the sample and feature spaces. Then,\nUNIFIER dynamically recovers the missing views, guided by the sample and\nfeature similarity graphs during the feature selection procedure. Furthermore,\nthe half-quadratic minimization technique is used to automatically weight\ndifferent instances, alleviating the impact of outliers and unreliable restored\ndata. Comprehensive experimental results demonstrate that UNIFIER outperforms\nother state-of-the-art methods.\n","authors":["Yanyong Huang","Zongxin Shen","Tianrui Li","Fengmao Lv"],"pdf_url":"https://arxiv.org/pdf/2401.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10547v1","updated":"2024-01-19T08:13:10Z","published":"2024-01-19T08:13:10Z","title":"PhoGAD: Graph-based Anomaly Behavior Detection with Persistent Homology\n Optimization","summary":" A multitude of toxic online behaviors, ranging from network attacks to\nanonymous traffic and spam, have severely disrupted the smooth operation of\nnetworks. Due to the inherent sender-receiver nature of network behaviors,\ngraph-based frameworks are commonly used for detecting anomalous behaviors.\nHowever, in real-world scenarios, the boundary between normal and anomalous\nbehaviors tends to be ambiguous. The local heterophily of graphs interferes\nwith the detection, and existing methods based on nodes or edges introduce\nunwanted noise into representation results, thereby impacting the effectiveness\nof detection. To address these issues, we propose PhoGAD, a graph-based anomaly\ndetection framework. PhoGAD leverages persistent homology optimization to\nclarify behavioral boundaries. Building upon this, the weights of adjacent\nedges are designed to mitigate the effects of local heterophily. Subsequently,\nto tackle the noise problem, we conduct a formal analysis and propose a\ndisentangled representation-based explicit embedding method, ultimately\nachieving anomaly behavior detection. 
Experiments on intrusion, traffic, and\nspam datasets verify that PhoGAD has surpassed the performance of\nstate-of-the-art (SOTA) frameworks in detection efficacy. Notably, PhoGAD\ndemonstrates robust detection even with diminished anomaly proportions,\nhighlighting its applicability to real-world scenarios. The analysis of\npersistent homology demonstrates its effectiveness in capturing the topological\nstructure formed by normal edge features. Additionally, ablation experiments\nvalidate the effectiveness of the innovative mechanisms integrated within\nPhoGAD.\n","authors":["Ziqi Yuan","Haoyi Zhou","Tianyu Chen","Jianxin Li"],"pdf_url":"https://arxiv.org/pdf/2401.10547v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2401.08169v2","updated":"2024-01-19T07:48:24Z","published":"2024-01-16T07:18:47Z","title":"Statistical Test for Attention Map in Vision Transformer","summary":" The Vision Transformer (ViT) demonstrates exceptional performance in various\ncomputer vision tasks. Attention is crucial for ViT to capture complex\nwide-ranging relationships among image patches, allowing the model to weigh the\nimportance of image patches and aiding our understanding of the decision-making\nprocess. However, when utilizing the attention of ViT as evidence in\nhigh-stakes decision-making tasks such as medical diagnostics, a challenge\narises due to the potential of attention mechanisms erroneously focusing on\nirrelevant regions. In this study, we propose a statistical test for ViT's\nattentions, enabling us to use the attentions as reliable quantitative evidence\nindicators for ViT's decision-making with a rigorously controlled error rate.\nUsing the framework called selective inference, we quantify the statistical\nsignificance of attentions in the form of p-values, which enables the\ntheoretically grounded quantification of the false positive detection\nprobability of attentions. We demonstrate the validity and the effectiveness of\nthe proposed method through numerical experiments and applications to brain\nimage diagnoses.\n","authors":["Tomohiro Shiraishi","Daiki Miwa","Teruyuki Katsuoka","Vo Nguyen Le Duy","Kouichi Taji","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2401.08169v2.pdf","comment":"42pages, 17figures"},{"id":"http://arxiv.org/abs/2401.10541v1","updated":"2024-01-19T07:44:32Z","published":"2024-01-19T07:44:32Z","title":"I-SplitEE: Image classification in Split Computing DNNs with Early Exits","summary":" The recent advances in Deep Neural Networks (DNNs) stem from their\nexceptional performance across various domains. However, their inherent large\nsize hinders deploying these networks on resource-constrained devices like\nedge, mobile, and IoT platforms. Strategies have emerged, from partial cloud\ncomputation offloading (split computing) to integrating early exits within DNN\nlayers. Our work presents an innovative unified approach merging early exits\nand split computing. We determine the 'splitting layer', the optimal depth in\nthe DNN for edge device computations, and whether to infer on edge device or be\noffloaded to the cloud for inference considering accuracy, computational\nefficiency, and communication costs. Also, Image classification faces diverse\nenvironmental distortions, influenced by factors like time of day, lighting,\nand weather. To adapt to these distortions, we introduce I-SplitEE, an online\nunsupervised algorithm ideal for scenarios lacking ground truths and with\nsequential data. 
Experimental validation using Caltech-256 and Cifar-10\ndatasets subjected to varied distortions showcases I-SplitEE's ability to\nreduce costs by a minimum of 55% with marginal performance degradation of at\nmost 5%.\n","authors":["Divya Jyoti Bajpai","Aastha Jaiswal","Manjesh Kumar Hanawal"],"pdf_url":"https://arxiv.org/pdf/2401.10541v1.pdf","comment":"To appear in proceedings of IEEE International Conference on\n Communications 2024"},{"id":"http://arxiv.org/abs/2401.10535v1","updated":"2024-01-19T07:21:45Z","published":"2024-01-19T07:21:45Z","title":"The \"Colonial Impulse\" of Natural Language Processing: An Audit of\n Bengali Sentiment Analysis Tools and Their Identity-based Biases","summary":" While colonization has sociohistorically impacted people's identities across\nvarious dimensions, those colonial values and biases continue to be perpetuated\nby sociotechnical systems. One category of sociotechnical systems--sentiment\nanalysis tools--can also perpetuate colonial values and bias, yet less\nattention has been paid to how such tools may be complicit in perpetuating\ncoloniality, although they are often used to guide various practices (e.g.,\ncontent moderation). In this paper, we explore potential bias in sentiment\nanalysis tools in the context of Bengali communities that have experienced and\ncontinue to experience the impacts of colonialism. Drawing on identity\ncategories most impacted by colonialism amongst local Bengali communities, we\nfocused our analytic attention on gender, religion, and nationality. We\nconducted an algorithmic audit of all sentiment analysis tools for Bengali,\navailable on the Python package index (PyPI) and GitHub. Despite similar\nsemantic content and structure, our analyses showed that in addition to\ninconsistencies in output from different tools, Bengali sentiment analysis\ntools exhibit bias between different identity categories and respond\ndifferently to different ways of identity expression. Connecting our findings\nwith colonially shaped sociocultural structures of Bengali communities, we\ndiscuss the implications of downstream bias of sentiment analysis tools.\n","authors":["Dipto Das","Shion Guha","Jed Brubaker","Bryan Semaan"],"pdf_url":"https://arxiv.org/pdf/2401.10535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. 
Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of cooccurring behaviors, and the\ncompounding impact of behavioral hallucinations. Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10522v1","updated":"2024-01-19T06:56:09Z","published":"2024-01-19T06:56:09Z","title":"FARe: Fault-Aware GNN Training on ReRAM-based PIM Accelerators","summary":" Resistive random-access memory (ReRAM)-based processing-in-memory (PIM)\narchitecture is an attractive solution for training Graph Neural Networks\n(GNNs) on edge platforms. However, the immature fabrication process and limited\nwrite endurance of ReRAMs make them prone to hardware faults, thereby limiting\ntheir widespread adoption for GNN training. Further, the existing\nfault-tolerant solutions prove inadequate for effectively training GNNs in the\npresence of faults. In this paper, we propose a fault-aware framework referred\nto as FARe that mitigates the effect of faults during GNN training. FARe\noutperforms existing approaches in terms of both accuracy and timing overhead.\nExperimental results demonstrate that FARe framework can restore GNN test\naccuracy by 47.6% on faulty ReRAM hardware with a ~1% timing overhead compared\nto the fault-free counterpart.\n","authors":["Pratyush Dhingra","Chukwufumnanya Ogbogu","Biresh Kumar Joardar","Janardhan Rao Doppa","Ananth Kalyanaraman","Partha Pratim Pande"],"pdf_url":"https://arxiv.org/pdf/2401.10522v1.pdf","comment":"This paper has been accepted to the conference DATE (Design,\n Automation and Test in Europe) - 2024"},{"id":"http://arxiv.org/abs/2401.10518v1","updated":"2024-01-19T06:26:05Z","published":"2024-01-19T06:26:05Z","title":"Spatial-temporal Forecasting for Regions without Observations","summary":" Spatial-temporal forecasting plays an important role in many real-world\napplications, such as traffic forecasting, air pollutant forecasting,\ncrowd-flow forecasting, and so on. State-of-the-art spatial-temporal\nforecasting models take data-driven approaches and rely heavily on data\navailability. Such models suffer from accuracy issues when data is incomplete,\nwhich is common in reality due to the heavy costs of deploying and maintaining\nsensors for data collection. A few recent studies attempted to address the\nissue of incomplete data. They typically assume some data availability in a\nregion of interest either for a short period or at a few locations. In this\npaper, we further study spatial-temporal forecasting for a region of interest\nwithout any historical observations, to address scenarios such as unbalanced\nregion development, progressive deployment of sensors or lack of open data. We\npropose a model named STSM for the task. 
The model takes a contrastive\nlearning-based approach to learn spatial-temporal patterns from adjacent\nregions that have recorded data. Our key insight is to learn from the locations\nthat resemble those in the region of interest, and we propose a selective\nmasking strategy to enable the learning. As a result, our model outperforms\nadapted state-of-the-art models, reducing errors consistently over both traffic\nand air pollutant forecasting tasks. The source code is available at\nhttps://github.com/suzy0223/STSM.\n","authors":["Xinyu Su","Jianzhong Qi","Egemen Tanin","Yanchuan Chang","Majid Sarvi"],"pdf_url":"https://arxiv.org/pdf/2401.10518v1.pdf","comment":"Accepted by EDBT2024"},{"id":"http://arxiv.org/abs/2401.07494v2","updated":"2024-01-19T06:16:59Z","published":"2024-01-15T06:26:53Z","title":"Input Convex Lipschitz RNN: A Fast and Robust Approach for Engineering\n Tasks","summary":" Computational efficiency and adversarial robustness are critical factors in\nreal-world engineering applications. Yet, conventional neural networks often\nfall short in addressing both simultaneously, or even separately. Drawing\ninsights from natural physical systems and existing literature, it is known\nthat an input convex architecture enhances computational efficiency, while a\nLipschitz-constrained architecture bolsters adversarial robustness. By\nleveraging the strengths of convexity and Lipschitz continuity, we develop a\nnovel network architecture, termed Input Convex Lipschitz Recurrent Neural\nNetworks. This model outperforms existing recurrent units across a spectrum of\nengineering tasks in terms of computational efficiency and adversarial\nrobustness. These tasks encompass a benchmark MNIST image classification,\nreal-world solar irradiance prediction for Solar PV system planning at LHT\nHoldings in Singapore, and real-time Model Predictive Control optimization for\na chemical reactor.\n","authors":["Zihao Wang","P S Pravin","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2401.07494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10516v1","updated":"2024-01-19T06:14:36Z","published":"2024-01-19T06:14:36Z","title":"Episodic Reinforcement Learning with Expanded State-reward Space","summary":" Empowered by deep neural networks, deep reinforcement learning (DRL) has\ndemonstrated tremendous empirical successes in various domains, including\ngames, health care, and autonomous driving. Despite these advancements, DRL is\nstill identified as data-inefficient as effective policies demand vast numbers\nof environmental samples. Recently, episodic control (EC)-based model-free DRL\nmethods enable sample efficiency by recalling past experiences from episodic\nmemory. However, existing EC-based methods suffer from the limitation of\npotential misalignment between the state and reward spaces for neglecting the\nutilization of (past) retrieval states with extensive information, which\nprobably causes inaccurate value estimation and degraded policy performance. To\ntackle this issue, we introduce an efficient EC-based DRL framework with\nexpanded state-reward space, where the expanded states used as the input and\nthe expanded rewards used in the training both contain historical and current\ninformation. To be specific, we reuse the historical states retrieved by EC as\npart of the input states and integrate the retrieved MC-returns into the\nimmediate reward in each interactive transition. 
As a result, our method is\nable to simultaneously achieve the full utilization of retrieval information\nand the better evaluation of state values by a Temporal Difference (TD) loss.\nEmpirical results on challenging Box2d and Mujoco tasks demonstrate the\nsuperiority of our method over a recent sibling method and common baselines.\nFurther, we also verify our method's effectiveness in alleviating Q-value\noverestimation by additional experiments of Q-value comparison.\n","authors":["Dayang Liang","Yaru Zhang","Yunlong Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10516v1.pdf","comment":"Accepted at AAMAS'24"},{"id":"http://arxiv.org/abs/2310.05492v3","updated":"2024-01-19T06:06:46Z","published":"2023-10-09T07:56:16Z","title":"How Abilities in Large Language Models are Affected by Supervised\n Fine-tuning Data Composition","summary":" Large language models (LLMs) with enormous pre-training tokens and parameters\nemerge diverse abilities, including math reasoning, code generation, and\ninstruction following. These abilities are further enhanced by supervised\nfine-tuning (SFT). While the open-source community has explored ad-hoc SFT for\nenhancing individual capabilities, proprietary LLMs exhibit versatility across\nvarious skills. Therefore, understanding the facilitation of multiple abilities\nvia SFT is paramount. In this study, we specifically focus on the interplay\nof data composition between mathematical reasoning, code generation, and\ngeneral human-aligning abilities during SFT. We propose four intriguing\nresearch questions to explore the association between model performance and\nvarious factors including data amount, composition ratio, model size and SFT\nstrategies. Our experiments reveal that distinct capabilities scale differently\nand larger models generally show superior performance with the same amount of data.\nMathematical reasoning and code generation consistently improve with increasing\ndata amount, whereas general abilities plateau after roughly a thousand\nsamples. Moreover, we observe data composition appears to enhance various\nabilities under limited data conditions, yet can lead to performance conflicts\nwhen data is plentiful. Our findings also suggest the amount of composition\ndata influences performance more than the composition ratio. In analysis of SFT\nstrategies, we find that sequentially learning multiple skills risks\ncatastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT)\nstrategy offers a promising solution to learn multiple abilities with different\nscaling patterns.\n","authors":["Guanting Dong","Hongyi Yuan","Keming Lu","Chengpeng Li","Mingfeng Xue","Dayiheng Liu","Wei Wang","Zheng Yuan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10510v1","updated":"2024-01-19T05:58:30Z","published":"2024-01-19T05:58:30Z","title":"A match made in consistency heaven: when large language models meet\n evolutionary algorithms","summary":" Pre-trained large language models (LLMs) have powerful capabilities for\ngenerating creative natural text. Evolutionary algorithms (EAs) can discover\ndiverse solutions to complex real-world problems. 
Motivated by the common\ncollective and directionality of text sequence generation and evolution, this\npaper illustrates the strong consistency of LLMs and EAs, which includes\nmultiple one-to-one key characteristics: token embedding and genotype-phenotype\nmapping, position encoding and fitness shaping, position embedding and\nselection, attention and crossover, feed-forward neural network and mutation,\nmodel training and parameter update, and multi-task learning and\nmulti-objective optimization. Based on this consistency perspective, existing\ncoupling studies are analyzed, including evolutionary fine-tuning and\nLLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap\nfor future research in coupling LLMs and EAs, while highlighting key challenges\nalong the way. The consistency not only reveals the evolution mechanism behind\nLLMs but also facilitates the development of evolved artificial agents that\napproach or surpass biological organisms.\n","authors":["Wang Chao","Jiaxuan Zhao","Licheng Jiao","Lingling Li","Fang Liu","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10510v1.pdf","comment":"A perspective article under review"},{"id":"http://arxiv.org/abs/2311.07202v3","updated":"2024-01-19T05:54:53Z","published":"2023-11-13T09:41:32Z","title":"Input Convex LSTM: A Convex Approach for Fast Lyapunov-Based Model\n Predictive Control","summary":" Leveraging Input Convex Neural Networks (ICNNs), ICNN-based Model Predictive\nControl (MPC) successfully attains globally optimal solutions by upholding\nconvexity within the MPC framework. However, current ICNN architectures\nencounter the issue of vanishing/exploding gradients, which limits their\nability to serve as deep neural networks for complex tasks. Additionally, the\ncurrent neural network-based MPC, including conventional neural network-based\nMPC and ICNN-based MPC, faces slower convergence speed when compared to MPC\nbased on first-principles models. In this study, we leverage the principles of\nICNNs to propose a novel Input Convex LSTM for Lyapunov-based MPC, with the\nspecific goal of reducing convergence time and mitigating the\nvanishing/exploding gradient problem while ensuring closed-loop stability. From\na simulation study of a nonlinear chemical reactor, we observed a mitigation of\nvanishing/exploding gradient problem and a reduction in convergence time, with\na percentage decrease of 46.7%, 31.3%, and 20.2% compared to baseline plain\nRNN, plain LSTM, and Input Convex Recurrent Neural Networks, respectively.\n","authors":["Zihao Wang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2311.07202v3.pdf","comment":"Submitted to 6th Annual Learning for Dynamics & Control Conference\n (L4DC 2024)"},{"id":"http://arxiv.org/abs/2401.08216v2","updated":"2024-01-19T05:31:07Z","published":"2024-01-16T09:02:34Z","title":"Towards Efficient and Certified Recovery from Poisoning Attacks in\n Federated Learning","summary":" Federated learning (FL) is vulnerable to poisoning attacks, where malicious\nclients manipulate their updates to affect the global model. Although various\nmethods exist for detecting those clients in FL, identifying malicious clients\nrequires sufficient model updates, and hence by the time malicious clients are\ndetected, FL models have been already poisoned. 
Thus, a method is needed to\nrecover an accurate global model after malicious clients are identified.\nCurrent recovery methods rely on (i) all historical information from\nparticipating FL clients and (ii) the initial model unaffected by the malicious\nclients, leading to a high demand for storage and computational resources. In\nthis paper, we show that highly effective recovery can still be achieved based\non (i) selective historical information rather than all historical information\nand (ii) a historical model that has not been significantly affected by\nmalicious clients rather than the initial model. In this scenario, while\nmaintaining comparable recovery performance, we can accelerate the recovery\nspeed and decrease memory consumption. Following this concept, we introduce\nCrab, an efficient and certified recovery method, which relies on selective\ninformation storage and adaptive model rollback. Theoretically, we demonstrate\nthat the difference between the global model recovered by Crab and the one\nrecovered by train-from-scratch can be bounded under certain assumptions. Our\nempirical evaluation, conducted across three datasets over multiple machine\nlearning models, and a variety of untargeted and targeted poisoning attacks\nreveals that Crab is both accurate and efficient, and consistently outperforms\nprevious approaches in terms of both recovery speed and memory consumption.\n","authors":["Yu Jiang","Jiyuan Shen","Ziyao Liu","Chee Wei Tan","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2401.08216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10495v1","updated":"2024-01-19T05:18:28Z","published":"2024-01-19T05:18:28Z","title":"Causal Layering via Conditional Entropy","summary":" Causal discovery aims to recover information about an unobserved causal graph\nfrom the observable data it generates. Layerings are orderings of the variables\nwhich place causes before effects. In this paper, we provide ways to recover\nlayerings of a graph by accessing the data via a conditional entropy oracle,\nwhen distributions are discrete. Our algorithms work by repeatedly removing\nsources or sinks from the graph. Under appropriate assumptions and\nconditioning, we can separate the sources or sinks from the remainder of the\nnodes by comparing their conditional entropy to the unconditional entropy of\ntheir noise. Our algorithms are provably correct and run in worst-case\nquadratic time. The main assumptions are faithfulness and injective noise, and\neither known noise entropies or weakly monotonically increasing noise entropies\nalong directed paths. In addition, we require one of either a very mild\nextension of faithfulness, or strictly monotonically increasing noise\nentropies, or expanding noise injectivity to include an additional single\nargument in the structural functions.\n","authors":["Itai Feigenbaum","Devansh Arpit","Huan Wang","Shelby Heinecke","Juan Carlos Niebles","Weiran Yao","Caiming Xiong","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2401.10495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10490v1","updated":"2024-01-19T05:01:43Z","published":"2024-01-19T05:01:43Z","title":"Generalization Error Guaranteed Auto-Encoder-Based Nonlinear Model\n Reduction for Operator Learning","summary":" Many physical processes in science and engineering are naturally represented\nby operators between infinite-dimensional function spaces. 
The problem of\noperator learning, in this context, seeks to extract these physical processes\nfrom empirical data, which is challenging due to the infinite or high\ndimensionality of data. An integral component in addressing this challenge is\nmodel reduction, which reduces both the data dimensionality and problem size.\nIn this paper, we utilize low-dimensional nonlinear structures in model\nreduction by investigating Auto-Encoder-based Neural Network (AENet). AENet\nfirst learns the latent variables of the input data and then learns the\ntransformation from these latent variables to corresponding output data. Our\nnumerical experiments validate the ability of AENet to accurately learn the\nsolution operator of nonlinear partial differential equations. Furthermore, we\nestablish a mathematical and statistical estimation theory that analyzes the\ngeneralization error of AENet. Our theoretical framework shows that the sample\ncomplexity of training AENet is intricately tied to the intrinsic dimension of\nthe modeled process, while also demonstrating the remarkable resilience of\nAENet to noise.\n","authors":["Hao Liu","Biraj Dahal","Rongjie Lai","Wenjing Liao"],"pdf_url":"https://arxiv.org/pdf/2401.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06120v3","updated":"2024-01-19T04:13:33Z","published":"2023-02-13T06:00:56Z","title":"Knowledge from Large-Scale Protein Contact Prediction Models Can Be\n Transferred to the Data-Scarce RNA Contact Prediction Task","summary":" RNA, whose functionality is largely determined by its structure, plays an\nimportant role in many biological activities. The prediction of pairwise\nstructural proximity between each nucleotide of an RNA sequence can\ncharacterize the structural information of the RNA. Historically, this problem\nhas been tackled by machine learning models using expert-engineered features\nand trained on scarce labeled datasets. Here, we find that the knowledge\nlearned by a protein-coevolution Transformer-based deep neural network can be\ntransferred to the RNA contact prediction task. As protein datasets are orders\nof magnitude larger than those for RNA contact prediction, our findings and the\nsubsequent framework greatly reduce the data scarcity bottleneck. Experiments\nconfirm that RNA contact prediction through transfer learning using a publicly\navailable protein model is greatly improved. Our findings indicate that the\nlearned structural patterns of proteins can be transferred to RNAs, opening up\npotential new avenues for research.\n","authors":["Yiren Jian","Chongyang Gao","Chen Zeng","Yunjie Zhao","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2302.06120v3.pdf","comment":"The code is available at\n https://github.com/yiren-jian/CoT-RNA-Transfer"},{"id":"http://arxiv.org/abs/2401.10478v1","updated":"2024-01-19T04:02:49Z","published":"2024-01-19T04:02:49Z","title":"Budgeted Online Model Selection and Fine-Tuning via Federated Learning","summary":" Online model selection involves selecting a model from a set of candidate\nmodels 'on the fly' to perform prediction on a stream of data. The choice of\ncandidate models henceforth has a crucial impact on the performance. Although\nemploying a larger set of candidate models naturally leads to more flexibility\nin model selection, this may be infeasible in cases where prediction tasks are\nperformed on edge devices with limited memory. 
Faced with this challenge, the\npresent paper proposes an online federated model selection framework where a\ngroup of learners (clients) interacts with a server with sufficient memory such\nthat the server stores all candidate models. However, each client only chooses\nto store a subset of models that can be fit into its memory and performs its\nown prediction task using one of the stored models. Furthermore, employing the\nproposed algorithm, clients and the server collaborate to fine-tune models to\nadapt them to a non-stationary environment. Theoretical analysis proves that\nthe proposed algorithm enjoys sub-linear regret with respect to the best model\nin hindsight. Experiments on real datasets demonstrate the effectiveness of the\nproposed algorithm.\n","authors":["Pouya M. Ghari","Yanning Shen"],"pdf_url":"https://arxiv.org/pdf/2401.10478v1.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2401.10474v1","updated":"2024-01-19T03:50:19Z","published":"2024-01-19T03:50:19Z","title":"LDReg: Local Dimensionality Regularized Self-Supervised Learning","summary":" Representations learned via self-supervised learning (SSL) can be susceptible\nto dimensional collapse, where the learned representation subspace is of\nextremely low dimensionality and thus fails to represent the full data\ndistribution and modalities. Dimensional collapse also known as the\n\"underfilling\" phenomenon is one of the major causes of degraded performance on\ndownstream tasks. Previous work has investigated the dimensional collapse\nproblem of SSL at a global level. In this paper, we demonstrate that\nrepresentations can span over high dimensional space globally, but collapse\nlocally. To address this, we propose a method called $\\textit{local\ndimensionality regularization (LDReg)}$. Our formulation is based on the\nderivation of the Fisher-Rao metric to compare and optimize local distance\ndistributions at an asymptotically small radius for each data point. By\nincreasing the local intrinsic dimensionality, we demonstrate through a range\nof experiments that LDReg improves the representation quality of SSL. The\nresults also show that LDReg can regularize dimensionality at both local and\nglobal levels.\n","authors":["Hanxun Huang","Ricardo J. G. B. Campello","Sarah Monazam Erfani","Xingjun Ma","Michael E. Houle","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2401.10474v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10467v1","updated":"2024-01-19T03:39:43Z","published":"2024-01-19T03:39:43Z","title":"Learning Backdoors for Mixed Integer Programs with Contrastive Learning","summary":" Many real-world problems can be efficiently modeled as Mixed Integer Programs\n(MIPs) and solved with the Branch-and-Bound method. Prior work has shown the\nexistence of MIP backdoors, small sets of variables such that prioritizing\nbranching on them when possible leads to faster running times. However, finding\nhigh-quality backdoors that improve running times remains an open question.\nPrevious work learns to estimate the relative solver speed of randomly sampled\nbackdoors through ranking and then decide whether to use it. In this paper, we\nutilize the Monte-Carlo tree search method to collect backdoors for training,\nrather than relying on random sampling, and adapt a contrastive learning\nframework to train a Graph Attention Network model to predict backdoors. 
Our\nmethod, evaluated on four common MIP problem domains, demonstrates performance\nimprovements over both Gurobi and previous models.\n","authors":["Junyang Cai","Taoan Huang","Bistra Dilkina"],"pdf_url":"https://arxiv.org/pdf/2401.10467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05225v2","updated":"2024-01-19T03:34:11Z","published":"2023-12-08T18:20:43Z","title":"Neural Spectral Methods: Self-supervised learning in the spectral domain","summary":" We present Neural Spectral Methods, a technique to solve parametric Partial\nDifferential Equations (PDEs), grounded in classical spectral methods. Our\nmethod uses orthogonal bases to learn PDE solutions as mappings between\nspectral coefficients. In contrast to current machine learning approaches which\nenforce PDE constraints by minimizing the numerical quadrature of the residuals\nin the spatiotemporal domain, we leverage Parseval's identity and introduce a\nnew training strategy through a \\textit{spectral loss}. Our spectral loss\nenables more efficient differentiation through the neural network, and\nsubstantially reduces training complexity. At inference time, the computational\ncost of our method remains constant, regardless of the spatiotemporal\nresolution of the domain. Our experimental results demonstrate that our method\nsignificantly outperforms previous machine learning approaches in terms of\nspeed and accuracy by one to two orders of magnitude on multiple different\nproblems. When compared to numerical solvers of the same accuracy, our method\ndemonstrates a $10\\times$ increase in performance speed.\n","authors":["Yiheng Du","Nithin Chalapathi","Aditi Krishnapriyan"],"pdf_url":"https://arxiv.org/pdf/2312.05225v2.pdf","comment":"Accepted to International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2401.10463v1","updated":"2024-01-19T03:24:36Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language models training dynamics. We develop a grokking\nconfiguration to reproduce grokking on simplistic language models stably by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking across\nsample-wise and model-wise, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. 
Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11171v4","updated":"2024-01-19T03:23:21Z","published":"2023-04-21T03:26:29Z","title":"Granular-ball computing: an efficient, robust, and interpretable\n adaptive multi-granularity representation and computation method","summary":" Human cognition operates on a \"Global-first\" cognitive mechanism,\nprioritizing information processing based on coarse-grained details. This\nmechanism inherently possesses an adaptive multi-granularity description\ncapacity, resulting in computational traits such as efficiency, robustness, and\ninterpretability. The analysis pattern reliance on the finest granularity and\nsingle-granularity makes most existing computational methods less efficient,\nrobust, and interpretable, which is an important reason for the current lack of\ninterpretability in neural networks. Multi-granularity granular-ball computing\nemploys granular-balls of varying sizes to adaptively represent and envelop the\nsample space, facilitating learning based on these granular-balls. Given that\nthe number of coarse-grained \"granular-balls\" is fewer than sample points,\ngranular-ball computing proves more efficient. Moreover, the inherent\ncoarse-grained nature of granular-balls reduces susceptibility to fine-grained\nsample disturbances, enhancing robustness. The multi-granularity construct of\ngranular-balls generates topological structures and coarse-grained\ndescriptions, naturally augmenting interpretability. Granular-ball computing\nhas successfully ventured into diverse AI domains, fostering the development of\ninnovative theoretical methods, including granular-ball classifiers, clustering\ntechniques, neural networks, rough sets, and evolutionary computing. This has\nnotably ameliorated the efficiency, noise robustness, and interpretability of\ntraditional methods. Overall, granular-ball computing is a rare and innovative\ntheoretical approach in AI that can adaptively and simultaneously enhance\nefficiency, robustness, and interpretability. This article delves into the main\napplication landscapes for granular-ball computing, aiming to equip future\nresearchers with references and insights to refine and expand this promising\ntheory.\n","authors":["Shuyin Xia","Guoyin Wang","Xinbo Gao","Xiaoyu Lian"],"pdf_url":"https://arxiv.org/pdf/2304.11171v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01521v2","updated":"2024-01-19T03:21:28Z","published":"2022-12-03T03:39:44Z","title":"Distribution Fitting for Combating Mode Collapse in Generative\n Adversarial Networks","summary":" Mode collapse is a significant unsolved issue of generative adversarial\nnetworks. In this work, we examine the causes of mode collapse from a novel\nperspective. Due to the nonuniform sampling in the training process, some\nsub-distributions may be missed when sampling data. As a result, even when the\ngenerated distribution differs from the real one, the GAN objective can still\nachieve the minimum. To address the issue, we propose a global distribution\nfitting (GDF) method with a penalty term to confine the generated data\ndistribution. 
When the generated distribution differs from the real one, GDF\nwill make the objective harder to reach the minimal value, while the original\nglobal minimum is not changed. To deal with the circumstance when the overall\nreal data is unreachable, we also propose a local distribution fitting (LDF)\nmethod. Experiments on several benchmarks demonstrate the effectiveness and\ncompetitive performance of GDF and LDF.\n","authors":["Yanxiang Gong","Zhiwei Xie","Guozhen Duan","Zheng Ma","Mei Xie"],"pdf_url":"https://arxiv.org/pdf/2212.01521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18426v3","updated":"2024-01-19T02:56:41Z","published":"2023-11-30T10:24:07Z","title":"Convergence Analysis of Fractional Gradient Descent","summary":" Fractional derivatives are a well-studied generalization of integer order\nderivatives. Naturally, for optimization, it is of interest to understand the\nconvergence properties of gradient descent using fractional derivatives.\nConvergence analysis of fractional gradient descent is currently limited both\nin the methods analyzed and the settings analyzed. This paper aims to fill in\nthese gaps by analyzing variations of fractional gradient descent in smooth and\nconvex, smooth and strongly convex, and smooth and non-convex settings. First,\nnovel bounds will be established bridging fractional and integer derivatives.\nThen, these bounds will be applied to the aforementioned settings to prove\nlinear convergence for smooth and strongly convex functions and $O(1/T)$\nconvergence for smooth and convex functions. Additionally, we prove $O(1/T)$\nconvergence for smooth and non-convex functions using an extended notion of\nsmoothness - H\\\"older smoothness - that is more natural for fractional\nderivatives. Finally, empirical results will be presented on the potential\nspeed up of fractional gradient descent over standard gradient descent as well\nas the challenges of predicting which will be faster in general.\n","authors":["Ashwani Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2311.18426v3.pdf","comment":"24 pages, 4 figures. Added additional results for smooth and convex\n functions"},{"id":"http://arxiv.org/abs/2401.10460v1","updated":"2024-01-19T02:51:00Z","published":"2024-01-19T02:51:00Z","title":"Ultra-lightweight Neural Differential DSP Vocoder For High Quality\n Speech Synthesis","summary":" Neural vocoders model the raw audio waveform and synthesize high-quality\naudio, but even the highly efficient ones, like MB-MelGAN and LPCNet, fail to\nrun real-time on a low-end device like a smartglass. A pure digital signal\nprocessing (DSP) based vocoder can be implemented via lightweight fast Fourier\ntransforms (FFT), and therefore, is a magnitude faster than any neural vocoder.\nA DSP vocoder often gets a lower audio quality due to consuming over-smoothed\nacoustic model predictions of approximate representations for the vocal tract.\nIn this paper, we propose an ultra-lightweight differential DSP (DDSP) vocoder\nthat uses a jointly optimized acoustic model with a DSP vocoder, and learns\nwithout an extracted spectral feature for the vocal tract. The model achieves\naudio quality comparable to neural vocoders with a high average MOS of 4.36\nwhile being efficient as a DSP vocoder. 
Our C++ implementation, without any\nhardware-specific optimization, is at 15 MFLOPS, surpasses MB-MelGAN by 340\ntimes in terms of FLOPS, and achieves a vocoder-only RTF of 0.003 and overall\nRTF of 0.044 while running single-threaded on a 2GHz Intel Xeon CPU.\n","authors":["Prabhav Agrawal","Thilo Koehler","Zhiping Xiu","Prashant Serai","Qing He"],"pdf_url":"https://arxiv.org/pdf/2401.10460v1.pdf","comment":"Accepted for ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.03320v4","updated":"2024-01-19T02:47:51Z","published":"2023-10-05T05:30:42Z","title":"BioBridge: Bridging Biomedical Foundation Models via Knowledge Graphs","summary":" Foundation models (FMs) are able to leverage large volumes of unlabeled data\nto demonstrate superior performance across a wide range of tasks. However, FMs\ndeveloped for biomedical domains have largely remained unimodal, i.e.,\nindependently trained and used for tasks on protein sequences alone, small\nmolecule structures alone, or clinical data alone. To overcome this limitation\nof biomedical FMs, we present BioBridge, a novel parameter-efficient learning\nframework, to bridge independently trained unimodal FMs to establish multimodal\nbehavior. BioBridge achieves it by utilizing Knowledge Graphs (KG) to learn\ntransformations between one unimodal FM and another without fine-tuning any\nunderlying unimodal FMs. Our empirical results demonstrate that BioBridge can\nbeat the best baseline KG embedding methods (on average by around 76.3%) in\ncross-modal retrieval tasks. We also identify BioBridge demonstrates\nout-of-domain generalization ability by extrapolating to unseen modalities or\nrelations. Additionally, we also show that BioBridge presents itself as a\ngeneral purpose retriever that can aid biomedical multimodal question answering\nas well as enhance the guided generation of novel drugs.\n","authors":["Zifeng Wang","Zichen Wang","Balasubramaniam Srinivasan","Vassilis N. Ioannidis","Huzefa Rangwala","Rishita Anubhai"],"pdf_url":"https://arxiv.org/pdf/2310.03320v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2311.15497v3","updated":"2024-01-19T02:45:44Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed improvements of up to 1.6% in test data, while maintaining the same\ninference time, and a substantial 1.0 percentage point performance gain in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08897v2","updated":"2024-01-19T02:39:59Z","published":"2024-01-17T00:46:24Z","title":"CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in\n Variational AutoEncoder","summary":" Symmetries of input and latent vectors have provided valuable insights for\ndisentanglement learning in VAEs. However, only a few works were proposed as an\nunsupervised method, and even these works require known factor information in\ntraining data. We propose a novel method, Composite Factor-Aligned Symmetry\nLearning (CFASL), which is integrated into VAEs for learning symmetry-based\ndisentanglement in unsupervised learning without any knowledge of the dataset\nfactor information. CFASL incorporates three novel features for learning\nsymmetry-based disentanglement: 1) Injecting inductive bias to align latent\nvector dimensions to factor-aligned symmetries within an explicit learnable\nsymmetry codebook 2) Learning a composite symmetry to express unknown factors\nchange between two random samples by learning factor-aligned symmetries within\nthe codebook 3) Inducing group equivariant encoder and decoder in training VAEs\nwith the two conditions. In addition, we propose an extended evaluation metric\nfor multi-factor changes in comparison to disentanglement evaluation in VAEs.\nIn quantitative and in-depth qualitative analysis, CFASL demonstrates a\nsignificant improvement of disentanglement in single-factor change, and\nmulti-factor change conditions compared to state-of-the-art methods.\n","authors":["Hee-Jun Jung","Jaehyoung Jeong","Kangil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08897v2.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2303.03183v2","updated":"2024-01-19T02:31:58Z","published":"2023-03-03T03:17:45Z","title":"Utilizing synthetic training data for the supervised classification of\n rat ultrasonic vocalizations","summary":" Murine rodents generate ultrasonic vocalizations (USVs) with frequencies that\nextend to around 120kHz. These calls are important in social behaviour, and so\ntheir analysis can provide insights into the function of vocal communication,\nand its dysfunction. The manual identification of USVs, and subsequent\nclassification into different subcategories is time consuming. Although machine\nlearning approaches for identification and classification can lead to enormous\nefficiency gains, the time and effort required to generate training data can be\nhigh, and the accuracy of current approaches can be problematic. Here we\ncompare the detection and classification performance of a trained human against\ntwo convolutional neural networks (CNNs), DeepSqueak and VocalMat, on audio\ncontaining rat USVs. Furthermore, we test the effect of inserting synthetic\nUSVs into the training data of the VocalMat CNN as a means of reducing the\nworkload associated with generating a training set. Our results indicate that\nVocalMat outperformed the DeepSqueak CNN on measures of call identification,\nand classification. 
Additionally, we found that the augmentation of training\ndata with synthetic images resulted in a further improvement in accuracy, such\nthat it was sufficiently close to human performance to allow for the use of\nthis software in laboratory conditions.\n","authors":["K. Jack Scott","Lucinda J. Speers","David K. Bilkey"],"pdf_url":"https://arxiv.org/pdf/2303.03183v2.pdf","comment":"25 pages, 5 main figures, 2 tables"},{"id":"http://arxiv.org/abs/2302.13854v2","updated":"2024-01-19T02:19:29Z","published":"2023-02-24T04:28:46Z","title":"A Deep Neural Network Based Reverse Radio Spectrogram Search Algorithm","summary":" Modern radio astronomy instruments generate vast amounts of data, and the\nincreasingly challenging radio frequency interference (RFI) environment\nnecessitates ever-more sophisticated RFI rejection algorithms. The \"needle in a\nhaystack\" nature of searches for transients and technosignatures requires us to\ndevelop methods that can determine whether a signal of interest has unique\nproperties, or is a part of some larger set of pernicious RFI. In the past,\nthis vetting has required onerous manual inspection of very large numbers of\nsignals. In this paper we present a fast and modular deep learning algorithm to\nsearch for lookalike signals of interest in radio spectrogram data. First, we\ntrained a B-Variational Autoencoder on signals returned by an energy detection\nalgorithm. We then adapted a positional embedding layer from classical\nTransformer architecture to embed additional metadata, which we demonstrate\nusing a frequency-based embedding. Next we used the encoder component of the\nB-Variational Autoencoder to extract features from small (~715 Hz, with a\nresolution of 2.79 Hz per frequency bin) windows in the radio spectrogram. We\nused our algorithm to conduct a search for a given query (encoded signal of\ninterest) on a set of signals (encoded features of searched items) to produce\nthe top candidates with similar features. We successfully demonstrate that the\nalgorithm retrieves signals with similar appearance, given only the original\nradio spectrogram data. This algorithm can be used to improve the efficiency of\nvetting signals of interest in technosignature searches, but could also be\napplied to a wider variety of searches for \"lookalike\" signals in large\nastronomical datasets.\n","authors":["Peter Xiangyuan Ma","Steve Croft","Chris Lintott","Andrew P. V. Siemion"],"pdf_url":"https://arxiv.org/pdf/2302.13854v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.10458v1","updated":"2024-01-19T02:16:30Z","published":"2024-01-19T02:16:30Z","title":"Contrastive Unlearning: A Contrastive Approach to Machine Unlearning","summary":" Machine unlearning aims to eliminate the influence of a subset of training\nsamples (i.e., unlearning samples) from a trained model. Effectively and\nefficiently removing the unlearning samples without negatively impacting the\noverall model performance is still challenging. In this paper, we propose a\ncontrastive unlearning framework, leveraging the concept of representation\nlearning for more effective unlearning. It removes the influence of unlearning\nsamples by contrasting their embeddings against the remaining samples so that\nthey are pushed away from their original classes and pulled toward other\nclasses. By directly optimizing the representation space, it effectively\nremoves the influence of unlearning samples while maintaining the\nrepresentations learned from the remaining samples. 
Experiments on a variety of\ndatasets and models on both class unlearning and sample unlearning showed that\ncontrastive unlearning achieves the best unlearning effects and efficiency with\nthe lowest performance loss compared with the state-of-the-art algorithms.\n","authors":["Hong kyu Lee","Qiuchen Zhang","Carl Yang","Jian Lou","Li Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.10458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10451v1","updated":"2024-01-19T01:40:58Z","published":"2024-01-19T01:40:58Z","title":"Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian\n Optimization Approach","summary":" Solving large-scale capacity expansion problems (CEPs) is central to\ncost-effective decarbonization of regional-scale energy systems. To ensure the\nintended outcomes of CEPs, modeling uncertainty due to weather-dependent\nvariable renewable energy (VRE) supply and energy demand becomes crucially\nimportant. However, the resulting stochastic optimization models are often less\ncomputationally tractable than their deterministic counterparts. Here, we\npropose a learning-assisted approximate solution method to tractably solve\ntwo-stage stochastic CEPs. Our method identifies low-cost planning decisions by\nconstructing and solving a sequence of tractable temporally aggregated\nsurrogate problems. We adopt a Bayesian optimization approach to searching the\nspace of time series aggregation hyperparameters and compute approximate\nsolutions that minimize costs on a validation set of supply-demand projections.\nImportantly, we evaluate solved planning outcomes on a held-out set of test\nprojections. We apply our approach to generation and transmission expansion\nplanning for a joint power-gas system spanning New England. We show that our\napproach yields an estimated cost savings of up to 3.8% in comparison to\nbenchmark time series aggregation approaches.\n","authors":["Aron Brenner","Rahman Khorramfar","Dharik Mallapragada","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2401.10451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05359v3","updated":"2024-01-19T01:30:56Z","published":"2022-05-11T09:11:02Z","title":"Exploring Local Explanations of Nonlinear Models Using Animated Linear\n Projections","summary":" The increased predictive power of machine learning models comes at the cost\nof increased complexity and loss of interpretability, particularly in\ncomparison to parametric statistical models. This trade-off has led to the\nemergence of eXplainable AI (XAI) which provides methods, such as local\nexplanations (LEs) and local variable attributions (LVAs), to shed light on how\na model use predictors to arrive at a prediction. These provide a point\nestimate of the linear variable importance in the vicinity of a single\nobservation. However, LVAs tend not to effectively handle association between\npredictors. To understand how the interaction between predictors affects the\nvariable importance estimate, we can convert LVAs into linear projections and\nuse the radial tour. This is also useful for learning how a model has made a\nmistake, or the effect of outliers, or the clustering of observations. The\napproach is illustrated with examples from categorical (penguin species,\nchocolate types) and quantitative (soccer/football salaries, house prices)\nresponse models. 
The methods are implemented in the R package cheem, available\non CRAN.\n","authors":["Nicholas Spyrison","Dianne Cook","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2205.05359v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10447v1","updated":"2024-01-19T01:30:16Z","published":"2024-01-19T01:30:16Z","title":"Investigating Training Strategies and Model Robustness of Low-Rank\n Adaptation for Language Modeling in Speech Recognition","summary":" The use of low-rank adaptation (LoRA) with frozen pretrained language models\n(PLMs) has become increasingly popular as a mainstream, resource-efficient\nmodeling approach for memory-constrained hardware. In this study, we first\nexplore how to enhance model performance by introducing various LoRA training\nstrategies, achieving relative word error rate reductions of 3.50\\% on the\npublic Librispeech dataset and of 3.67\\% on an internal dataset in the\nmessaging domain. To further characterize the stability of LoRA-based\nsecond-pass speech recognition models, we examine robustness against input\nperturbations. These perturbations are rooted in homophone replacements and a\nnovel metric called N-best Perturbation-based Rescoring Robustness (NPRR), both\ndesigned to measure the relative degradation in the performance of rescoring\nmodels. Our experimental results indicate that while advanced variants of LoRA,\nsuch as dynamic rank-allocated LoRA, lead to performance degradation in\n$1$-best perturbation, they alleviate the degradation in $N$-best perturbation.\nThis finding is in comparison to fully-tuned models and vanilla LoRA tuning\nbaselines, suggesting that a comprehensive selection is needed when using\nLoRA-based adaptation for compute-cost savings and robust language modeling.\n","authors":["Yu Yu","Chao-Han Huck Yang","Tuan Dinh","Sungho Ryu","Jari Kolehmainen","Roger Ren","Denis Filimonov","Prashanth G. Shivakumar","Ankur Gandhe","Ariya Rastow","Jia Xu","Ivan Bulyko","Andreas Stolcke"],"pdf_url":"https://arxiv.org/pdf/2401.10447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04336v3","updated":"2024-01-19T01:30:04Z","published":"2024-01-09T03:29:40Z","title":"Deep Efficient Private Neighbor Generation for Subgraph Federated\n Learning","summary":" Behemoth graphs are often fragmented and separately stored by multiple data\nowners as distributed subgraphs in many realistic applications. Without harming\ndata privacy, it is natural to consider the subgraph federated learning\n(subgraph FL) scenario, where each local client holds a subgraph of the entire\nglobal graph, to obtain globally generalized graph mining models. To overcome\nthe unique challenge of incomplete information propagation on local subgraphs\ndue to missing cross-subgraph neighbors, previous works resort to the\naugmentation of local neighborhoods through the joint FL of missing neighbor\ngenerators and GNNs. Yet their technical designs have profound limitations\nregarding the utility, efficiency, and privacy goals of FL. In this work, we\npropose FedDEP to comprehensively tackle these challenges in subgraph FL.\nFedDEP consists of a series of novel technical designs: (1) Deep neighbor\ngeneration through leveraging the GNN embeddings of potential missing\nneighbors; (2) Efficient pseudo-FL for neighbor generation through embedding\nprototyping; and (3) Privacy protection through noise-less\nedge-local-differential-privacy. We analyze the correctness and efficiency of\nFedDEP, and provide theoretical guarantees on its privacy. 
Empirical results on\nfour real-world datasets justify the clear benefits of proposed techniques.\n","authors":["Ke Zhang","Lichao Sun","Bolin Ding","Siu Ming Yiu","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2401.04336v3.pdf","comment":"Accepted to SDM 2024"},{"id":"http://arxiv.org/abs/2401.10446v1","updated":"2024-01-19T01:29:27Z","published":"2024-01-19T01:29:27Z","title":"Large Language Models are Efficient Learners of Noise-Robust Speech\n Recognition","summary":" Recent advances in large language models (LLMs) have promoted generative\nerror correction (GER) for automatic speech recognition (ASR), which leverages\nthe rich linguistic knowledge and powerful reasoning ability of LLMs to improve\nrecognition results. The latest work proposes a GER benchmark with HyPoradise\ndataset to learn the mapping from ASR N-best hypotheses to ground-truth\ntranscription by efficient LLM finetuning, which shows great effectiveness but\nlacks specificity on noise-robust ASR. In this work, we extend the benchmark to\nnoisy conditions and investigate if we can teach LLMs to perform denoising for\nGER just like what robust ASR do}, where one solution is introducing noise\ninformation as a conditioner into LLM. However, directly incorporating noise\nembeddings from audio encoder could harm the LLM tuning due to cross-modality\ngap. To this end, we propose to extract a language-space noise embedding from\nthe N-best list to represent the noise conditions of source speech, which can\npromote the denoising process in GER. Furthermore, in order to enhance its\nrepresentation ability of audio noise, we design a knowledge distillation (KD)\napproach via mutual information estimation to distill the real noise\ninformation in audio embeddings to our language embedding. Experiments on\nvarious latest LLMs demonstrate our approach achieves a new breakthrough with\nup to 53.9% correction improvement in terms of word error rate while with\nlimited training data. Analysis shows that our language-space noise embedding\ncan well represent the noise conditions of source speech, under which\noff-the-shelf LLMs show strong ability of language-space denoising.\n","authors":["Yuchen Hu","Chen Chen","Chao-Han Huck Yang","Ruizhe Li","Chao Zhang","Pin-Yu Chen","EnSiong Chng"],"pdf_url":"https://arxiv.org/pdf/2401.10446v1.pdf","comment":"Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be\n open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license"},{"id":"http://arxiv.org/abs/2312.10401v2","updated":"2024-01-19T01:25:39Z","published":"2023-12-16T10:05:18Z","title":"Rethinking Dimensional Rationale in Graph Contrastive Learning from\n Causal Perspective","summary":" Graph contrastive learning is a general learning paradigm excelling at\ncapturing invariant information from diverse perturbations in graphs. Recent\nworks focus on exploring the structural rationale from graphs, thereby\nincreasing the discriminability of the invariant information. However, such\nmethods may incur in the mis-learning of graph models towards the\ninterpretability of graphs, and thus the learned noisy and task-agnostic\ninformation interferes with the prediction of graphs. To this end, with the\npurpose of exploring the intrinsic rationale of graphs, we accordingly propose\nto capture the dimensional rationale from graphs, which has not received\nsufficient attention in the literature. The conducted exploratory experiments\nattest to the feasibility of the aforementioned roadmap. 
To elucidate the\ninnate mechanism behind the performance improvement arising from the\ndimensional rationale, we rethink the dimensional rationale in graph\ncontrastive learning from a causal perspective and further formalize the\ncausality among the variables in the pre-training stage to build the\ncorresponding structural causal model. On the basis of the understanding of the\nstructural causal model, we propose the dimensional rationale-aware graph\ncontrastive learning approach, which introduces a learnable dimensional\nrationale acquiring network and a redundancy reduction constraint. The\nlearnable dimensional rationale acquiring network is updated by leveraging a\nbi-level meta-learning technique, and the redundancy reduction constraint\ndisentangles the redundant features through a decorrelation process during\nlearning. Empirically, compared with state-of-the-art methods, our method can\nyield significant performance boosts on various benchmarks with respect to\ndiscriminability and transferability. The code implementation of our method is\navailable at https://github.com/ByronJi/DRGCL.\n","authors":["Qirui Ji","Jiangmeng Li","Jie Hu","Rui Wang","Changwen Zheng","Fanjiang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.10401v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.10442v1","updated":"2024-01-19T01:11:44Z","published":"2024-01-19T01:11:44Z","title":"Path Choice Matters for Clear Attribution in Path Methods","summary":" Rigorousness and clarity are both essential for interpretations of DNNs to\nengender human trust. Path methods are commonly employed to generate rigorous\nattributions that satisfy three axioms. However, the meaning of attributions\nremains ambiguous due to distinct path choices. To address the ambiguity, we\nintroduce \\textbf{Concentration Principle}, which centrally allocates high\nattributions to indispensable features, thereby endowing aesthetic and\nsparsity. We then present \\textbf{SAMP}, a model-agnostic interpreter, which\nefficiently searches the near-optimal path from a pre-defined set of\nmanipulation paths. Moreover, we propose the infinitesimal constraint (IC) and\nmomentum strategy (MS) to improve the rigorousness and optimality.\nVisualizations show that SAMP can precisely reveal DNNs by pinpointing salient\nimage pixels. We also perform quantitative experiments and observe that our\nmethod significantly outperforms the counterparts. Code:\nhttps://github.com/zbr17/SAMP.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.10442v1.pdf","comment":"ICLR 2024 accepted"},{"id":"http://arxiv.org/abs/2210.02672v3","updated":"2024-01-19T00:57:05Z","published":"2022-10-06T04:30:59Z","title":"A Novel Maximum-Entropy-Driven Technique for Low-Rank Orthogonal\n Nonnegative Matrix Factorization with $\\ell_0$-Norm sparsity Constraint","summary":" In data-driven control and machine learning, a common requirement involves\nbreaking down large matrices into smaller, low-rank factors that possess\nspecific levels of sparsity. This paper introduces an innovative solution to\nthe orthogonal nonnegative matrix factorization (ONMF) problem. The objective\nis to approximate input data by using two low-rank nonnegative matrices,\nadhering to both orthogonality and $\\ell_0$-norm sparsity constraints. the\nproposed maximum-entropy-principle based framework ensures orthogonality and\nsparsity of features or the mixing matrix, while maintaining nonnegativity in\nboth. 
Additionally, the methodology offers a quantitative determination of the\n``true'' number of underlying features, a crucial hyperparameter for ONMF.\nExperimental evaluation on synthetic and a standard datasets highlights the\nmethod's superiority in terms of sparsity, orthogonality, and computational\nspeed compared to existing approaches. Notably, the proposed method achieves\ncomparable or improved reconstruction errors in line with the literature.\n","authors":["Salar Basiri","Srinivasa Salapaka"],"pdf_url":"https://arxiv.org/pdf/2210.02672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00110v3","updated":"2024-01-19T00:35:35Z","published":"2023-12-30T01:24:25Z","title":"Diffusion Model with Perceptual Loss","summary":" Diffusion models trained with mean squared error loss tend to generate\nunrealistic samples. Current state-of-the-art models rely on classifier-free\nguidance to improve sample quality, yet its surprising effectiveness is not\nfully understood. In this paper, we show that the effectiveness of\nclassifier-free guidance partly originates from it being a form of implicit\nperceptual guidance. As a result, we can directly incorporate perceptual loss\nin diffusion training to improve sample quality. Since the score matching\nobjective used in diffusion training strongly resembles the denoising\nautoencoder objective used in unsupervised training of perceptual networks, the\ndiffusion model itself is a perceptual network and can be used to generate\nmeaningful perceptual loss. We propose a novel self-perceptual objective that\nresults in diffusion models capable of generating more realistic samples. For\nconditional generation, our method only improves sample quality without\nentanglement with the conditional input and therefore does not sacrifice sample\ndiversity. Our method can also improve sample quality for unconditional\ngeneration, which was not possible with classifier-free guidance before.\n","authors":["Shanchuan Lin","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00110v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07988v3","updated":"2024-01-19T00:28:45Z","published":"2023-09-14T19:01:08Z","title":"Folding Attention: Memory and Power Optimization for On-Device\n Transformer-based Streaming Speech Recognition","summary":" Transformer-based models excel in speech recognition. Existing efforts to\noptimize Transformer inference, typically for long-context applications, center\non simplifying attention score calculations. However, streaming speech\nrecognition models usually process a limited number of tokens each time, making\nattention score calculation less of a bottleneck. Instead, the bottleneck lies\nin the linear projection layers of multi-head attention and feedforward\nnetworks, constituting a substantial portion of the model size and contributing\nsignificantly to computation, memory, and power usage.\n To address this bottleneck, we propose folding attention, a technique\ntargeting these linear layers, significantly reducing model size and improving\nmemory and power efficiency. Experiments on on-device Transformer-based\nstreaming speech recognition models show that folding attention reduces model\nsize (and corresponding memory consumption) by up to 24% and power consumption\nby up to 23%, all without compromising model accuracy or computation overhead.\n","authors":["Yang Li","Liangzhen Lai","Yuan Shangguan","Forrest N. 
Iandola","Zhaoheng Ni","Ernie Chang","Yangyang Shi","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.07988v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10432v1","updated":"2024-01-19T00:27:34Z","published":"2024-01-19T00:27:34Z","title":"A2Q+: Improving Accumulator-Aware Weight Quantization","summary":" Quantization techniques commonly reduce the inference costs of neural\nnetworks by restricting the precision of weights and activations. Recent\nstudies show that also reducing the precision of the accumulator can further\nimprove hardware efficiency at the risk of numerical overflow, which introduces\narithmetic errors that can degrade model accuracy. To avoid numerical overflow\nwhile maintaining accuracy, recent work proposed accumulator-aware quantization\n(A2Q), a quantization-aware training method that constrains model weights\nduring training to safely use a target accumulator bit width during inference.\nAlthough this shows promise, we demonstrate that A2Q relies on an overly\nrestrictive constraint and a sub-optimal weight initialization strategy that\neach introduce superfluous quantization error. To address these shortcomings,\nwe introduce: (1) an improved bound that alleviates accumulator constraints\nwithout compromising overflow avoidance; and (2) a new strategy for\ninitializing quantized weights from pre-trained floating-point checkpoints. We\ncombine these contributions with weight normalization to introduce A2Q+. We\nsupport our analysis with experiments that show A2Q+ significantly improves the\ntrade-off between accumulator bit width and model accuracy and characterize new\ntrade-offs that arise as a consequence of accumulator constraints.\n","authors":["Ian Colbert","Alessandro Pappalardo","Jakoba Petri-Koenig","Yaman Umuroglu"],"pdf_url":"https://arxiv.org/pdf/2401.10432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01409v4","updated":"2024-01-19T00:23:28Z","published":"2022-06-03T06:34:09Z","title":"Hybrid Parameter Search and Dynamic Model Selection for Mixed-Variable\n Bayesian Optimization","summary":" This paper presents a new type of hybrid model for Bayesian optimization (BO)\nadept at managing mixed variables, encompassing both quantitative (continuous\nand integer) and qualitative (categorical) types. Our proposed new hybrid\nmodels (named hybridM) merge the Monte Carlo Tree Search structure (MCTS) for\ncategorical variables with Gaussian Processes (GP) for continuous ones. hybridM\nleverages the upper confidence bound tree search (UCTS) for MCTS strategy,\nshowcasing the tree architecture's integration into Bayesian optimization. Our\ninnovations, including dynamic online kernel selection in the surrogate\nmodeling phase and a unique UCTS search strategy, position our hybrid models as\nan advancement in mixed-variable surrogate models. Numerical experiments\nunderscore the superiority of hybrid models, highlighting their potential in\nBayesian optimization.\n","authors":["Hengrui Luo","Younghyun Cho","James W. Demmel","Xiaoye S. Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2206.01409v4.pdf","comment":"33 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2305.14402v3","updated":"2024-01-19T00:16:49Z","published":"2023-05-23T10:16:08Z","title":"Enhancing Speech Emotion Recognition Through Differentiable Architecture\n Search","summary":" Speech Emotion Recognition (SER) is a critical enabler of emotion-aware\ncommunication in human-computer interactions. 
Recent advancements in Deep\nLearning (DL) have substantially enhanced the performance of SER models through\nincreased model complexity. However, designing optimal DL architectures\nrequires prior experience and experimental evaluations. Encouragingly, Neural\nArchitecture Search (NAS) offers a promising avenue to determine an optimal DL\nmodel automatically. In particular, Differentiable Architecture Search (DARTS)\nis an efficient method of using NAS to search for optimised models. This paper\nproposes a DARTS-optimised joint CNN and LSTM architecture, to improve SER\nperformance, where the literature informs the selection of CNN and LSTM\ncoupling to offer improved performance. While DARTS has previously been applied\nto CNN and LSTM combinations, our approach introduces a novel mechanism,\nparticularly in selecting CNN operations using DARTS. In contrast to previous\nstudies, we refrain from imposing constraints on the order of the layers for\nthe CNN within the DARTS cell; instead, we allow DARTS to determine the optimal\nlayer order autonomously. Experimenting with the IEMOCAP and MSP-IMPROV\ndatasets, we demonstrate that our proposed methodology achieves significantly\nhigher SER accuracy than hand-engineering the CNN-LSTM configuration. It also\noutperforms the best-reported SER results achieved using DARTS on CNN-LSTM.\n","authors":["Thejan Rajapakshe","Rajib Rana","Sara Khalifa","Berrak Sisman","Björn Schuller"],"pdf_url":"https://arxiv.org/pdf/2305.14402v3.pdf","comment":"5 pages, 4 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.10608v1","updated":"2024-01-19T10:37:27Z","published":"2024-01-19T10:37:27Z","title":"M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics\n Prediction from Histopathology Images","summary":" The advancement of Spatial Transcriptomics (ST) has facilitated the\nspatially-aware profiling of gene expressions based on histopathology images.\nAlthough ST data offers valuable insights into the micro-environment of tumors,\nits acquisition cost remains expensive. Therefore, directly predicting the ST\nexpressions from digital pathology images is desired. Current methods usually\nadopt existing regression backbones for this task, which ignore the inherent\nmulti-scale hierarchical data structure of digital pathology images. To address\nthis limit, we propose M2ORT, a many-to-one regression Transformer that can\naccommodate the hierarchical structure of the pathology images through a\ndecoupled multi-scale feature extractor. Different from traditional models that\nare trained with one-to-one image-label pairs, M2ORT accepts multiple pathology\nimages of different magnifications at a time to jointly predict the gene\nexpressions at their corresponding common ST spot, aiming at learning a\nmany-to-one relationship through training. We have tested M2ORT on three public\nST datasets and the experimental results show that M2ORT can achieve\nstate-of-the-art performance with fewer parameters and floating-point\noperations (FLOPs). 
The code is available at:\nhttps://github.com/Dootmaan/M2ORT/.\n","authors":["Hongyi Wang","Xiuju Du","Jing Liu","Shuyi Ouyang","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10475v1","updated":"2024-01-19T03:54:58Z","published":"2024-01-19T03:54:58Z","title":"CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short\n Video Search Scenarios","summary":" Vision-Language Models pre-trained on large-scale image-text datasets have\nshown superior performance in downstream tasks such as image retrieval. Most of\nthe images for pre-training are presented in the form of open domain\ncommon-sense visual elements. Differently, video covers in short video search\nscenarios are presented as user-originated contents that provide important\nvisual summaries of videos. In addition, a portion of the video covers come\nwith manually designed cover texts that provide semantic complements. In order\nto fill in the gaps in short video cover data, we establish the first\nlarge-scale cover-text benchmark for Chinese short video search scenarios.\nSpecifically, we release two large-scale datasets CBVS-5M/10M to provide short\nvideo covers, and the manual fine-labeling dataset CBVS-20K to provide real\nuser queries, which serves as an image-text benchmark test in the Chinese short\nvideo search field. To integrate the semantics of cover text in the case of\nmodality missing, we propose UniCLIP where cover texts play a guiding role\nduring training, however are not relied upon by inference. Extensive evaluation\non CBVS-20K demonstrates the excellent performance of our proposal. UniCLIP has\nbeen deployed to Tencent's online video search systems with hundreds of\nmillions of visits and achieved significant gains. The complete dataset, code\nand checkpoints will be available upon release.\n","authors":["Xiangshuo Qiao","Xianxin Li","Xiaozhe Qu","Jie Zhang","Yang Liu","Yu Luo","Cihang Jin","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2401.10475v1.pdf","comment":null}]},"2024-01-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.06766v2","updated":"2024-01-22T18:55:35Z","published":"2024-01-12T18:58:26Z","title":"Mind Your Format: Towards Consistent Evaluation of In-Context Learning\n Improvements","summary":" Large language models demonstrate a remarkable capability for learning to\nsolve new tasks from a few examples. The prompt template, or the way the input\nexamples are formatted to obtain the prompt, is an important yet often\noverlooked aspect of in-context learning. In this work, we conduct a\ncomprehensive study of the template format's influence on the in-context\nlearning performance. We evaluate the impact of the prompt template across\nmodels (from 770M to 70B parameters) and 4 standard classification datasets. We\nshow that a poor choice of the template can reduce the performance of the\nstrongest models and inference methods to a random guess level. More\nimportantly, the best templates do not transfer between different setups and\neven between models of the same family. Our findings show that the currently\nprevalent approach to evaluation, which ignores template selection, may give\nmisleading results due to different templates in different works. As a first\nstep towards mitigating this issue, we propose Template Ensembles that\naggregate model predictions across several templates. 
This simple test-time\naugmentation boosts average performance while being robust to the choice of\nrandom set of templates.\n","authors":["Anton Voronov","Lena Wolf","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2401.06766v2.pdf","comment":"21 pages, 10 figures. Code:\n https://github.com/yandex-research/mind-your-format"},{"id":"http://arxiv.org/abs/2401.12208v1","updated":"2024-01-22T18:51:07Z","published":"2024-01-22T18:51:07Z","title":"CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation","summary":" Chest X-rays (CXRs) are the most frequently performed imaging test in\nclinical practice. Recent advances in the development of vision-language\nfoundation models (FMs) give rise to the possibility of performing automated\nCXR interpretation, which can assist physicians with clinical decision-making\nand improve patient outcomes. However, developing FMs that can accurately\ninterpret CXRs is challenging due to the (1) limited availability of\nlarge-scale vision-language datasets in the medical image domain, (2) lack of\nvision and language encoders that can capture the complexities of medical data,\nand (3) absence of evaluation frameworks for benchmarking the abilities of FMs\non CXR interpretation. In this work, we address these challenges by first\nintroducing \\emph{CheXinstruct} - a large-scale instruction-tuning dataset\ncurated from 28 publicly-available datasets. We then present \\emph{CheXagent} -\nan instruction-tuned FM capable of analyzing and summarizing CXRs. To build\nCheXagent, we design a clinical large language model (LLM) for parsing\nradiology reports, a vision encoder for representing CXR images, and a network\nto bridge the vision and language modalities. Finally, we introduce\n\\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs\nacross 8 clinically-relevant CXR interpretation tasks. Extensive quantitative\nevaluations and qualitative reviews with five expert radiologists demonstrate\nthat CheXagent outperforms previously-developed general- and medical-domain FMs\non CheXbench tasks. Furthermore, in an effort to improve model transparency, we\nperform a fairness evaluation across factors of sex, race and age to highlight\npotential performance disparities. Our project is at\n\\url{https://stanford-aimi.github.io/chexagent.html}.\n","authors":["Zhihong Chen","Maya Varma","Jean-Benoit Delbrouck","Magdalini Paschali","Louis Blankemeier","Dave Van Veen","Jeya Maria Jose Valanarasu","Alaa Youssef","Joseph Paul Cohen","Eduardo Pontes Reis","Emily B. Tsai","Andrew Johnston","Cameron Olsen","Tanishq Mathew Abraham","Sergios Gatidis","Akshay S. Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2401.12208v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12200v1","updated":"2024-01-22T18:39:40Z","published":"2024-01-22T18:39:40Z","title":"APT: Adaptive Pruning and Tuning Pretrained Language Models for\n Efficient Training and Inference","summary":" Fine-tuning and inference with large Language Models (LM) are generally known\nto be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces\ntraining memory by updating a small number of LM parameters but does not\nimprove inference efficiency. Structured pruning improves LM inference\nefficiency by removing consistent parameter blocks, yet often increases\ntraining memory and time. To improve both training and inference efficiency, we\nintroduce APT that adaptively prunes and tunes parameters for the LMs. 
At the\nearly stage of fine-tuning, APT dynamically adds salient tuning parameters for\nfast and accurate convergence while discarding unimportant parameters for\nefficiency. Compared to baselines, our experiments show that APT maintains up\nto 98% task performance when pruning RoBERTa and T5 models with 40% parameters\nleft while keeping 86.4% LLaMA models' performance with 70% parameters\nremained. Furthermore, APT speeds up LMs fine-tuning by up to 8x and reduces\nlarge LMs memory training footprint by up to 70%.\n","authors":["Bowen Zhao","Hannaneh Hajishirzi","Qingqing Cao"],"pdf_url":"https://arxiv.org/pdf/2401.12200v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.12192v1","updated":"2024-01-22T18:34:42Z","published":"2024-01-22T18:34:42Z","title":"Text Embedding Inversion Attacks on Multilingual Language Models","summary":" Representing textual information as real-numbered embeddings has become the\nnorm in NLP. Moreover, with the rise of public interest in large language\nmodels (LLMs), Embeddings as a Service (EaaS) has rapidly gained traction as a\nbusiness model. This is not without outstanding security risks, as previous\nresearch has demonstrated that sensitive data can be reconstructed from\nembeddings, even without knowledge of the underlying model that generated them.\nHowever, such work is limited by its sole focus on English, leaving all other\nlanguages vulnerable to attacks by malicious actors. %As many international and\nmultilingual companies leverage EaaS, there is an urgent need for research into\nmultilingual LLM security. To this end, this work investigates LLM security\nfrom the perspective of multilingual embedding inversion. Concretely, we define\nthe problem of black-box multilingual and cross-lingual inversion attacks, with\nspecial attention to a cross-domain scenario. Our findings reveal that\nmultilingual models are potentially more vulnerable to inversion attacks than\ntheir monolingual counterparts. This stems from the reduced data requirements\nfor achieving comparable inversion performance in settings where the underlying\nlanguage is not known a-priori. To our knowledge, this work is the first to\ndelve into multilinguality within the context of inversion attacks, and our\nfindings highlight the need for further investigation and enhanced defenses in\nthe area of NLP Security.\n","authors":["Yiyi Chen","Heather Lent","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2401.12192v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2401.12187v1","updated":"2024-01-22T18:27:08Z","published":"2024-01-22T18:27:08Z","title":"WARM: On the Benefits of Weight Averaged Reward Models","summary":" Aligning large language models (LLMs) with human preferences through\nreinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit\nfailures in the reward model (RM) to achieve seemingly high rewards without\nmeeting the underlying objectives. We identify two primary challenges when\ndesigning RMs to mitigate reward hacking: distribution shifts during the RL\nprocess and inconsistencies in human preferences. As a solution, we propose\nWeight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then\naveraging them in the weight space. This strategy follows the observation that\nfine-tuned weights remain linearly mode connected when sharing the same\npre-training. 
By averaging weights, WARM improves efficiency compared to the\ntraditional ensembling of predictions, while improving reliability under\ndistribution shifts and robustness to preference inconsistencies. Our\nexperiments on summarization tasks, using best-of-N and RL methods, shows that\nWARM improves the overall quality and alignment of LLM predictions; for\nexample, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy\nRL fine-tuned with a single RM.\n","authors":["Alexandre Ramé","Nino Vieillard","Léonard Hussenot","Robert Dadashi","Geoffrey Cideron","Olivier Bachem","Johan Ferret"],"pdf_url":"https://arxiv.org/pdf/2401.12187v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.12181v1","updated":"2024-01-22T18:11:01Z","published":"2024-01-22T18:11:01Z","title":"Universal Neurons in GPT2 Language Models","summary":" A basic question within the emerging field of mechanistic interpretability is\nthe degree to which neural networks learn the same underlying mechanisms. In\nother words, are neural mechanisms universal across different models? In this\nwork, we study the universality of individual neurons across GPT2 models\ntrained from different initial random seeds, motivated by the hypothesis that\nuniversal neurons are likely to be interpretable. In particular, we compute\npairwise correlations of neuron activations over 100 million tokens for every\nneuron pair across five different seeds and find that 1-5\\% of neurons are\nuniversal, that is, pairs of neurons which consistently activate on the same\ninputs. We then study these universal neurons in detail, finding that they\nusually have clear interpretations and taxonomize them into a small number of\nneuron families. We conclude by studying patterns in neuron weights to\nestablish several universal functional roles of neurons in simple circuits:\ndeactivating attention heads, changing the entropy of the next token\ndistribution, and predicting the next token to (not) be within a particular\nset.\n","authors":["Wes Gurnee","Theo Horsley","Zifan Carl Guo","Tara Rezaei Kheirkhah","Qinyi Sun","Will Hathaway","Neel Nanda","Dimitris Bertsimas"],"pdf_url":"https://arxiv.org/pdf/2401.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12178v1","updated":"2024-01-22T18:09:52Z","published":"2024-01-22T18:09:52Z","title":"In-Context Learning for Extreme Multi-Label Classification","summary":" Multi-label classification problems with thousands of classes are hard to\nsolve with in-context learning alone, as language models (LMs) might lack prior\nknowledge about the precise classes or how to assign them, and it is generally\ninfeasible to demonstrate every class in a prompt. We propose a general\nprogram, $\\texttt{Infer--Retrieve--Rank}$, that defines multi-step interactions\nbetween LMs and retrievers to efficiently tackle such problems. We implement\nthis program using the $\\texttt{DSPy}$ programming model, which specifies\nin-context systems in a declarative manner, and use $\\texttt{DSPy}$ optimizers\nto tune it towards specific datasets by bootstrapping only tens of few-shot\nexamples. Our primary extreme classification program, optimized separately for\neach task, attains state-of-the-art results across three benchmarks (HOUSE,\nTECH, TECHWOLF). We apply the same program to a benchmark with vastly different\ncharacteristics and attain competitive performance as well (BioDEX). 
Unlike\nprior work, our proposed solution requires no finetuning, is easily applicable\nto new tasks, alleviates prompt engineering, and requires only tens of labeled\nexamples. Our code is public at https://github.com/KarelDO/xmc.dspy.\n","authors":["Karel D'Oosterlinck","Omar Khattab","François Remy","Thomas Demeester","Chris Develder","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2401.12178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12143v1","updated":"2024-01-22T17:26:55Z","published":"2024-01-22T17:26:55Z","title":"Anisotropy Is Inherent to Self-Attention in Transformers","summary":" The representation degeneration problem is a phenomenon that is widely\nobserved among self-supervised learning methods based on Transformers. In NLP,\nit takes the form of anisotropy, a singular property of hidden representations\nwhich makes them unexpectedly close to each other in terms of angular distance\n(cosine-similarity). Some recent works tend to show that anisotropy is a\nconsequence of optimizing the cross-entropy loss on long-tailed distributions\nof tokens. We show in this paper that anisotropy can also be observed\nempirically in language models with specific objectives that should not suffer\ndirectly from the same consequences. We also show that the anisotropy problem\nextends to Transformers trained on other modalities. Our observations suggest\nthat anisotropy is actually inherent to Transformers-based models.\n","authors":["Nathan Godey","Éric de la Clergerie","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2401.12143v1.pdf","comment":"Proceedings of EACL 2024. Previously presented at ACL-SRW 2023\n (arXiv:2306.07656). 
arXiv admin note: substantial text overlap with\n arXiv:2306.07656"},{"id":"http://arxiv.org/abs/2401.10491v2","updated":"2024-01-22T17:16:37Z","published":"2024-01-19T05:02:46Z","title":"Knowledge Fusion of Large Language Models","summary":" While training large language models (LLMs) from scratch can generate models\nwith distinct functionalities and strengths, it comes at significant costs and\nmay result in redundant capabilities. Alternatively, a cost-effective and\ncompelling approach is to merge existing pre-trained LLMs into a more potent\nmodel. However, due to the varying architectures of these LLMs, directly\nblending their weights is impractical. In this paper, we introduce the notion\nof knowledge fusion for LLMs, aimed at combining the capabilities of existing\nLLMs and transferring them into a single LLM. By leveraging the generative\ndistributions of source LLMs, we externalize their collective knowledge and\nunique strengths, thereby potentially elevating the capabilities of the target\nmodel beyond those of any individual source LLM. We validate our approach using\nthree popular LLMs with different architectures--Llama-2, MPT, and\nOpenLLaMA--across various benchmarks and tasks. Our findings confirm that the\nfusion of LLMs can improve the performance of the target model across a range\nof capabilities such as reasoning, commonsense, and code generation. Our code,\nmodel weights, and data are public at\n\\url{https://github.com/fanqiwan/FuseLLM}.\n","authors":["Fanqi Wan","Xinting Huang","Deng Cai","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10491v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2304.14317v2","updated":"2024-01-22T17:06:50Z","published":"2023-04-27T16:38:17Z","title":"ICE-Score: Instructing Large Language Models to Evaluate Code","summary":" Recent advancements in the field of natural language generation have\nfacilitated the use of large language models to assess the quality of generated\ntext. Although these models have shown promising results in tasks such as\nmachine translation and summarization, their applicability in code intelligence\ntasks remains limited without human involvement. The complexity of programming\nconcepts required for such tasks makes it difficult to develop evaluation\nmetrics that align with human judgment. Token-matching-based metrics, such as\nBLEU, have demonstrated weak correlations with human practitioners in code\nintelligence tasks. Moreover, utilizing human-written test suites to evaluate\nfunctional correctness can be challenging in domains with low resources. To\novercome these obstacles, we propose \\texttt{ICE-Score}, a new evaluation\nmetric via instructing large language models (LLMs) for code assessments. Our\nmetric addresses the limitations of existing approaches by achieving superior\ncorrelations with functional correctness and human preferences, without the\nneed for test oracles or references. We evaluate the efficacy of our metric on\ntwo different aspects (\\textit{human preference} and \\textit{execution\nsuccess}) and four programming languages. Our results demonstrate that our\nmetric surpasses state-of-the-art metrics for code generation, delivering high\nlevels of accuracy and consistency across various programming languages and\ntasks. 
We also make our evaluation metric and datasets available to the\npublic\\footnote{\\url{https://github.com/terryyz/ice-score}}, encouraging\nfurther research in evaluating code intelligence tasks.\n","authors":["Terry Yue Zhuo"],"pdf_url":"https://arxiv.org/pdf/2304.14317v2.pdf","comment":"Accepted to Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2401.12117v1","updated":"2024-01-22T16:57:05Z","published":"2024-01-22T16:57:05Z","title":"The Curious Case of Nonverbal Abstract Reasoning with Multi-Modal Large\n Language Models","summary":" While large language models (LLMs) are still being adopted to new domains and\nutilized in novel applications, we are experiencing an influx of the new\ngeneration of foundation models, namely multi-modal large language models\n(MLLMs). These models integrate verbal and visual information, opening new\npossibilities to demonstrate more complex reasoning abilities at the\nintersection of the two modalities. However, despite the revolutionizing\nprospect of MLLMs, our understanding of their reasoning abilities is limited.\nIn this study, we assess the nonverbal abstract reasoning abilities of\nopen-source and closed-source MLLMs using variations of Raven's Progressive\nMatrices. Our experiments expose the difficulty of solving such problems while\nshowcasing the immense gap between open-source and closed-source models. We\nalso reveal critical shortcomings with individual visual and textual modules,\nsubjecting the models to low-performance ceilings. Finally, to improve MLLMs'\nperformance, we experiment with various methods, such as Chain-of-Thought\nprompting, resulting in a significant (up to 100%) boost in performance.\n","authors":["Kian Ahrabian","Zhivar Sourati","Kexuan Sun","Jiarui Zhang","Yifan Jiang","Fred Morstatter","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2401.12117v1.pdf","comment":"Code and datasets are available at\n https://github.com/kahrabian/mllm-nvar"},{"id":"http://arxiv.org/abs/2401.12097v1","updated":"2024-01-22T16:35:00Z","published":"2024-01-22T16:35:00Z","title":"An Empirical Analysis of In-context Learning Abilities of LLMs for MT","summary":" In-context learning (ICL) has consistently demonstrated superior performance\nover zero-shot performance in large language models (LLMs). However, the\nunderstanding of the dynamics of ICL and the aspects that influence downstream\nperformance remains limited, especially for natural language generation (NLG)\ntasks. This work aims to address this gap by investigating the ICL capabilities\nof LLMs and studying the impact of different aspects of the in-context\ndemonstrations for the task of machine translation (MT). Our preliminary\ninvestigations aim to discern whether in-context learning (ICL) is\npredominantly influenced by demonstrations or instructions by applying diverse\nperturbations to in-context demonstrations while preserving the task\ninstruction. We observe varying behavior to perturbed examples across different\nmodel families, notably with BLOOM-7B derivatives being severely influenced by\nnoise, whereas Llama 2 derivatives not only exhibit robustness but also tend to\nshow enhancements over the clean baseline when subject to perturbed\ndemonstrations. This suggests that the robustness of ICL may be governed by\nseveral factors, including the type of noise, perturbation direction (source or\ntarget), the extent of pretraining of the specific model, and fine-tuning for\ndownstream tasks if applicable. 
Further investigation is warranted to develop a\ncomprehensive understanding of these factors in future research.\n","authors":["Pranjal A. Chitale","Jay Gala","Varun Gumma","Mitesh M. Khapra","Raj Dabre"],"pdf_url":"https://arxiv.org/pdf/2401.12097v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.12088v1","updated":"2024-01-22T16:25:47Z","published":"2024-01-22T16:25:47Z","title":"Unsupervised Learning of Graph from Recipes","summary":" Cooking recipes are one of the most readily available kinds of procedural\ntext. They consist of natural language instructions that can be challenging to\ninterpret. In this paper, we propose a model to identify relevant information\nfrom recipes and generate a graph to represent the sequence of actions in the\nrecipe. In contrast with other approaches, we use an unsupervised approach. We\niteratively learn the graph structure and the parameters of a $\\mathsf{GNN}$\nencoding the texts (text-to-graph) one sequence at a time while providing the\nsupervision by decoding the graph into text (graph-to-text) and comparing the\ngenerated text to the input. We evaluate the approach by comparing the\nidentified entities with annotated datasets, comparing the difference between\nthe input and output texts, and comparing our generated graphs with those\ngenerated by state of the art methods.\n","authors":["Aissatou Diallo","Antonis Bikakis","Luke Dickens","Anthony Hunter","Rob Miller"],"pdf_url":"https://arxiv.org/pdf/2401.12088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12087v1","updated":"2024-01-22T16:25:27Z","published":"2024-01-22T16:25:27Z","title":"Revisiting Demonstration Selection Strategies in In-Context Learning","summary":" Large language models (LLMs) have shown an impressive ability to perform a\nwide range of tasks using in-context learning (ICL), where a few examples are\nused to describe a task to the model. However, the performance of ICL varies\nsignificantly with the choice of demonstrations, and it is still unclear why\nthis happens or what factors will influence its choice. In this work, we first\nrevisit the factors contributing to this variance from both data and model\naspects, and find that the choice of demonstration is both data- and\nmodel-dependent. We further proposed a data- and model-dependent demonstration\nselection method, \\textbf{TopK + ConE}, based on the assumption that\n\\textit{the performance of a demonstration positively correlates with its\ncontribution to the model's understanding of the test samples}, resulting in a\nsimple and effective recipe for ICL. Empirically, our method yields consistent\nimprovements in both language understanding and generation tasks with different\nmodel scales. Further analyses confirm that, besides the generality and\nstability under different circumstances, our method provides a unified\nexplanation for the effectiveness of previous methods. Code will be released.\n","authors":["Keqin Peng","Liang Ding","Yancheng Yuan","Xuebo Liu","Min Zhang","Yuanxin Ouyang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.12087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12086v1","updated":"2024-01-22T16:24:43Z","published":"2024-01-22T16:24:43Z","title":"West-of-N: Synthetic Preference Generation for Improved Reward Modeling","summary":" The success of reinforcement learning from human feedback (RLHF) in language\nmodel alignment is strongly dependent on the quality of the underlying reward\nmodel. 
In this paper, we present a novel approach to improve reward model\nquality by generating synthetic preference data, thereby augmenting the\ntraining dataset with on-policy, high-quality preference pairs. Motivated by\nthe promising results of Best-of-N sampling strategies in language model\ntraining, we extend their application to reward model training. This results in\na self-training strategy to generate preference pairs by selecting the best and\nworst candidates in a pool of responses to a given query. Empirically, we find\nthat this approach improves the performance of any reward model, with an effect\ncomparable to the addition of a similar quantity of human preference data. This\nwork opens up new avenues of research for improving RLHF for language model\nalignment, by offering synthetic preference generation as a solution to reward\nmodeling challenges.\n","authors":["Alizée Pace","Jonathan Mallinson","Eric Malmi","Sebastian Krause","Aliaksei Severyn"],"pdf_url":"https://arxiv.org/pdf/2401.12086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12078v1","updated":"2024-01-22T16:20:14Z","published":"2024-01-22T16:20:14Z","title":"Temporal Blind Spots in Large Language Models","summary":" Large language models (LLMs) have recently gained significant attention due\nto their unparalleled ability to perform various natural language processing\ntasks. These models, benefiting from their advanced natural language\nunderstanding capabilities, have demonstrated impressive zero-shot performance.\nHowever, the pre-training data utilized in LLMs is often confined to a specific\ncorpus, resulting in inherent freshness and temporal scope limitations.\nConsequently, this raises concerns regarding the effectiveness of LLMs for\ntasks involving temporal intents. In this study, we aim to investigate the\nunderlying limitations of general-purpose LLMs when deployed for tasks that\nrequire a temporal understanding. We pay particular attention to handling\nfactual temporal knowledge through three popular temporal QA datasets.\nSpecifically, we observe low performance on detailed questions about the past\nand, surprisingly, for rather new information. In manual and automatic testing,\nwe find multiple temporal errors and characterize the conditions under which QA\nperformance deteriorates. Our analysis contributes to understanding LLM\nlimitations and offers valuable insights into developing future models that can\nbetter cater to the demands of temporally-oriented tasks. The code is\navailable\\footnote{https://github.com/jwallat/temporalblindspots}.\n","authors":["Jonas Wallat","Adam Jatowt","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2401.12078v1.pdf","comment":"accepted at WSDM'24"},{"id":"http://arxiv.org/abs/2401.12072v1","updated":"2024-01-22T16:13:45Z","published":"2024-01-22T16:13:45Z","title":"Cross-lingual Transfer Learning for Javanese Dependency Parsing","summary":" While structure learning achieves remarkable performance in high-resource\nlanguages, the situation differs for under-represented languages due to the\nscarcity of annotated data. This study focuses on assessing the efficacy of\ntransfer learning in enhancing dependency parsing for Javanese, a language\nspoken by 80 million individuals but characterized by limited representation in\nnatural language processing. We utilized the Universal Dependencies dataset\nconsisting of dependency treebanks from more than 100 languages, including\nJavanese. 
We propose two learning strategies to train the model: transfer\nlearning (TL) and hierarchical transfer learning (HTL). While TL only uses a\nsource language to pre-train the model, the HTL method uses a source language\nand an intermediate language in the learning process. The results show that our\nbest model uses the HTL method, which improves performance with an increase of\n10% for both UAS and LAS evaluations compared to the baseline model.\n","authors":["Fadli Aulawi Al Ghiffari","Ika Alfina","Kurniawati Azizah"],"pdf_url":"https://arxiv.org/pdf/2401.12072v1.pdf","comment":"Accepted at IJCNLP-AACL 2023 SRW"},{"id":"http://arxiv.org/abs/2401.12070v1","updated":"2024-01-22T16:09:47Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v1.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2311.14212v3","updated":"2024-01-22T15:05:30Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations. We introduce the term annotation sensitivity to refer to the\nimpact of annotation data collection methods on the annotations themselves and\non downstream model performance and predictions. We collect annotations of hate\nspeech and offensive language in five experimental conditions of an annotation\ninstrument, randomly assigning annotators to conditions. We then fine-tune BERT\nmodels on each of the five resulting datasets and evaluate model performance on\na holdout portion of each condition. We find considerable differences between\nthe conditions for 1) the share of hate speech/offensive language annotations,\n2) model performance, 3) model predictions, and 4) model learning curves. Our\nresults emphasize the crucial role played by the annotation instrument which\nhas received little attention in the machine learning literature. 
We call for\nadditional research into how and why the instrument impacts the annotations to\ninform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v3.pdf","comment":"EMNLP 2023 Findings:\n https://aclanthology.org/2023.findings-emnlp.992/"},{"id":"http://arxiv.org/abs/2306.00824v2","updated":"2024-01-22T14:57:47Z","published":"2023-06-01T15:46:36Z","title":"Zero and Few-shot Semantic Parsing with Ambiguous Inputs","summary":" Despite the frequent challenges posed by ambiguity when representing meaning\nvia natural language, it is often ignored or deliberately removed in tasks\nmapping language to formally-designed representations, which generally assume a\none-to-one mapping between linguistic and formal representations. We attempt to\naddress this shortcoming by introducing AmP, a framework, dataset, and\nchallenge for translating ambiguous natural language to formal representations\nlike logic and code. We define templates and generate data for five\nwell-documented linguistic ambiguities. Using AmP, we investigate how several\nfew-shot text-to-code systems handle ambiguity, introducing three new metrics.\nWe find that large pre-trained models perform poorly at capturing the\ndistribution of possible meanings without deliberate instruction. However,\nmodels are able to capture the distribution well when ambiguity is attested in\ntheir inputs. These results motivate a call for including ambiguity explicitly\nin datasets and promote considering the distribution of possible outputs when\nevaluating systems. Data and code: https://github.com/esteng/ambiguous_parsing\n","authors":["Elias Stengel-Eskin","Kyle Rawlins","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2306.00824v2.pdf","comment":"ICLR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2401.12005v1","updated":"2024-01-22T14:53:59Z","published":"2024-01-22T14:53:59Z","title":"ALMs: Authorial Language Models for Authorship Attribution","summary":" In this paper, we introduce an authorship attribution method called Authorial\nLanguage Models (ALMs) that involves identifying the most likely author of a\nquestioned document based on the perplexity of the questioned document\ncalculated for a set of causal language models fine-tuned on the writings of a\nset of candidate author. We benchmarked ALMs against state-of-art-systems using\nthe CCAT50 dataset and the Blogs50 datasets. We find that ALMs achieves a\nmacro-average accuracy score of 83.6% on Blogs50, outperforming all other\nmethods, and 74.9% on CCAT50, matching the performance of the best method. To\nassess the performance of ALMs on shorter texts, we also conducted text\nablation testing. We found that to reach a macro-average accuracy of 70%, ALMs\nneeds 40 tokens on Blogs50 and 400 tokens on CCAT50, while to reach 60% ALMs\nrequires 20 tokens on Blogs50 and 70 tokens on CCAT50.\n","authors":["Weihang Huang","Akira Murakami","Jack Grieve"],"pdf_url":"https://arxiv.org/pdf/2401.12005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02118v2","updated":"2024-01-22T14:41:43Z","published":"2023-10-03T14:59:35Z","title":"TWIZ-v2: The Wizard of Multimodal Conversational-Stimulus","summary":" In this report, we describe the vision, challenges, and scientific\ncontributions of the Task Wizard team, TWIZ, in the Alexa Prize TaskBot\nChallenge 2022. 
Our vision, is to build TWIZ bot as an helpful, multimodal,\nknowledgeable, and engaging assistant that can guide users towards the\nsuccessful completion of complex manual tasks. To achieve this, we focus our\nefforts on three main research questions: (1) Humanly-Shaped Conversations, by\nproviding information in a knowledgeable way; (2) Multimodal Stimulus, making\nuse of various modalities including voice, images, and videos; and (3)\nZero-shot Conversational Flows, to improve the robustness of the interaction to\nunseen scenarios. TWIZ is an assistant capable of supporting a wide range of\ntasks, with several innovative features such as creative cooking, video\nnavigation through voice, and the robust TWIZ-LLM, a Large Language Model\ntrained for dialoguing about complex manual tasks. Given ratings and feedback\nprovided by users, we observed that TWIZ bot is an effective and robust system,\ncapable of guiding users through tasks while providing several multimodal\nstimuli.\n","authors":["Rafael Ferreira","Diogo Tavares","Diogo Silva","Rodrigo Valério","João Bordalo","Inês Simões","Vasco Ramos","David Semedo","João Magalhães"],"pdf_url":"https://arxiv.org/pdf/2310.02118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11972v1","updated":"2024-01-22T14:24:03Z","published":"2024-01-22T14:24:03Z","title":"Synergizing Machine Learning & Symbolic Methods: A Survey on Hybrid\n Approaches to Natural Language Processing","summary":" The advancement of machine learning and symbolic approaches have underscored\ntheir strengths and weaknesses in Natural Language Processing (NLP). While\nmachine learning approaches are powerful in identifying patterns in data, they\noften fall short in learning commonsense and the factual knowledge required for\nthe NLP tasks. Meanwhile, the symbolic methods excel in representing\nknowledge-rich data. However, they struggle to adapt dynamic data and\ngeneralize the knowledge. Bridging these two paradigms through hybrid\napproaches enables the alleviation of weaknesses in both while preserving their\nstrengths. Recent studies extol the virtues of this union, showcasing promising\nresults in a wide range of NLP tasks. In this paper, we present an overview of\nhybrid approaches used for NLP. Specifically, we delve into the\nstate-of-the-art hybrid approaches used for a broad spectrum of NLP tasks\nrequiring natural language understanding, generation, and reasoning.\nFurthermore, we discuss the existing resources available for hybrid approaches\nfor NLP along with the challenges, offering a roadmap for future directions.\n","authors":["Rrubaa Panchendrarajan","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.11972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11969v1","updated":"2024-01-22T14:17:03Z","published":"2024-01-22T14:17:03Z","title":"Claim Detection for Automated Fact-checking: A Survey on Monolingual,\n Multilingual and Cross-Lingual Research","summary":" Automated fact-checking has drawn considerable attention over the past few\ndecades due to the increase in the diffusion of misinformation on online\nplatforms. This is often carried out as a sequence of tasks comprising (i) the\ndetection of sentences circulating in online platforms which constitute claims\nneeding verification, followed by (ii) the verification process of those\nclaims. This survey focuses on the former, by discussing existing efforts\ntowards detecting claims needing fact-checking, with a particular focus on\nmultilingual data and methods. 
This is a challenging and fertile direction\nwhere existing methods are yet far from matching human performance due to the\nprofoundly challenging nature of the issue. Especially, the dissemination of\ninformation across multiple social platforms, articulated in multiple languages\nand modalities demands more generalized solutions for combating misinformation.\nFocusing on multilingual misinformation, we present a comprehensive survey of\nexisting multilingual claim detection research. We present state-of-the-art\nmultilingual claim detection research categorized into three key factors of the\nproblem, verifiability, priority, and similarity. Further, we present a\ndetailed overview of the existing multilingual datasets along with the\nchallenges and suggest possible future advancements.\n","authors":["Rrubaa Panchendrarajan","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.11969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14578v2","updated":"2024-01-22T14:13:51Z","published":"2023-05-23T23:31:24Z","title":"Connecting the Dots: What Graph-Based Text Representations Work Best for\n Text Classification Using Graph Neural Networks?","summary":" Given the success of Graph Neural Networks (GNNs) for structure-aware machine\nlearning, many studies have explored their use for text classification, but\nmostly in specific domains with limited data characteristics. Moreover, some\nstrategies prior to GNNs relied on graph mining and classical machine learning,\nmaking it difficult to assess their effectiveness in modern settings. This work\nextensively investigates graph representation methods for text classification,\nidentifying practical implications and open challenges. We compare different\ngraph construction schemes using a variety of GNN architectures and setups\nacross five datasets, encompassing short and long documents as well as\nunbalanced scenarios in diverse domains. Two Transformer-based large language\nmodels are also included to complement the study. The results show that i)\nalthough the effectiveness of graphs depends on the textual input features and\ndomain, simple graph constructions perform better the longer the documents are,\nii) graph representations are especially beneficial for longer documents,\noutperforming Transformer-based models, iii) graph methods are particularly\nefficient at solving the task.\n","authors":["Margarita Bugueño","Gerard de Melo"],"pdf_url":"https://arxiv.org/pdf/2305.14578v2.pdf","comment":"Accepted to Findings of the Association for Computational\n Linguistics: EMNLP 2023 (Long Paper). 17 pages, 2 figures, 15 tables. The\n Appendix starts on page 12"},{"id":"http://arxiv.org/abs/2310.01386v2","updated":"2024-01-22T13:58:50Z","published":"2023-10-02T17:46:09Z","title":"Who is ChatGPT? Benchmarking LLMs' Psychological Portrayal Using\n PsychoBench","summary":" Large Language Models (LLMs) have recently showcased their remarkable\ncapacities, not only in natural language processing tasks but also across\ndiverse domains such as clinical medicine, legal consultation, and education.\nLLMs become more than mere applications, evolving into assistants capable of\naddressing diverse user requests. This narrows the distinction between human\nbeings and artificial intelligence agents, raising intriguing questions\nregarding the potential manifestation of personalities, temperaments, and\nemotions within LLMs. In this paper, we propose a framework, PsychoBench, for\nevaluating diverse psychological aspects of LLMs. 
Comprising thirteen scales\ncommonly used in clinical psychology, PsychoBench further classifies these\nscales into four distinct categories: personality traits, interpersonal\nrelationships, motivational tests, and emotional abilities. Our study examines\nfive popular models, namely text-davinci-003, gpt-3.5-turbo, gpt-4, LLaMA-2-7b,\nand LLaMA-2-13b. Additionally, we employ a jailbreak approach to bypass the\nsafety alignment protocols and test the intrinsic natures of LLMs. We have made\nPsychoBench openly accessible via https://github.com/CUHK-ARISE/PsychoBench.\n","authors":["Jen-tse Huang","Wenxuan Wang","Eric John Li","Man Ho Lam","Shujie Ren","Youliang Yuan","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2310.01386v2.pdf","comment":"Accepted for ICLR 2024 Oral Presentation. 15 pages (main text) and 5\n pages (appendix)"},{"id":"http://arxiv.org/abs/2401.11944v1","updated":"2024-01-22T13:34:34Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU.\n CMMMU includes 12k manually collected multimodal questions from college\nexams, quizzes, and textbooks, covering six core disciplines: Art & Design,\nBusiness, Science, Health & Medicine, Humanities & Social Science, and Tech &\nEngineering, like its companion, MMMU. These questions span 30 subjects and\ncomprise 39 highly heterogeneous image types, such as charts, diagrams, maps,\ntables, music sheets, and chemical structures.\n CMMMU focuses on complex perception and reasoning with domain-specific\nknowledge in the Chinese context. We evaluate 11 open-source LLMs and one\nproprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%,\nindicating a large space for improvement. CMMMU will boost the community to\nbuild the next-generation LMMs towards expert artificial intelligence and\npromote the democratization of LMMs by providing diverse language contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Wenhu Chen","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. 
We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11911v1","updated":"2024-01-22T12:54:04Z","published":"2024-01-22T12:54:04Z","title":"Blinded by Generated Contexts: How Language Models Merge Generated and\n Retrieved Contexts for Open-Domain QA?","summary":" While auxiliary information has become a key to enhance Large Language Models\n(LLMs), relatively little is known about how well LLMs merge these contexts,\nspecifically generated and retrieved. To study this, we formulate a task\nspecifically designed to identify whether the answers, derived from the\nintegration of generated and retrieved contexts, are attributed to either\ngenerated or retrieved contexts. To support this task, we develop a methodology\nto construct datasets with conflicting contexts, where each question is paired\nwith both generated and retrieved contexts, yet only one of them contains the\ncorrect answer. Our experiments reveal a significant bias in LLMs towards\ngenerated contexts, as evidenced across state-of-the-art open (Llama2-7b/13b)\nand closed (GPT 3.5/4) systems. We further identify two key factors\ncontributing to this bias: i) Contexts generated by LLMs typically show greater\nsimilarity to the questions, increasing their likelihood of selection; ii) The\nsegmentation process used in retrieved contexts disrupts their completeness,\nthereby hindering their full utilization in LLMs. Our analysis enhances the\nunderstanding of how LLMs merge diverse contexts, offering valuable insights\nfor advancing current augmentation methods for LLMs.\n","authors":["Hexiang Tan","Fei Sun","Wanli Yang","Yuanzhuo Wang","Qi Cao","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.11911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10337v2","updated":"2024-01-22T12:33:43Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. 
To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v2.pdf","comment":"accepted at EACL 2024, in ARR October 2023"},{"id":"http://arxiv.org/abs/2311.07989v4","updated":"2024-01-22T12:27:47Z","published":"2023-11-14T08:34:26Z","title":"Unifying the Perspectives of NLP and Software Engineering: A Survey on\n Language Models for Code","summary":" In this work we systematically review the recent advancements in code\nprocessing with language models, covering 50+ models, 30+ evaluation tasks,\n170+ datasets, and 700+ related works. We break down code processing models\ninto general language models represented by the GPT family and specialized\nmodels that are specifically pretrained on code, often with tailored\nobjectives. We discuss the relations and differences between these models, and\nhighlight the historical transition of code modeling from statistical models\nand RNNs to pretrained Transformers and LLMs, which is exactly the same course\nthat had been taken by NLP. We also discuss code-specific features such as AST,\nCFG, and unit tests, along with their application in training code language\nmodels, and identify key challenges and potential future directions in this\ndomain. We keep the survey open and updated on GitHub at\nhttps://github.com/codefuse-ai/Awesome-Code-LLM.\n","authors":["Ziyin Zhang","Chaoyu Chen","Bingchang Liu","Cong Liao","Zi Gong","Hang Yu","Jianguo Li","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07989v4.pdf","comment":"Repo is available at https://github.com/codefuse-ai/Awesome-Code-LLM.\n 8 figures, 10 tables, and 713 references"},{"id":"http://arxiv.org/abs/2401.11880v1","updated":"2024-01-22T12:11:55Z","published":"2024-01-22T12:11:55Z","title":"PsySafe: A Comprehensive Framework for Psychological-based Attack,\n Defense, and Evaluation of Multi-agent System Safety","summary":" Multi-agent systems, augmented with Large Language Models (LLMs), demonstrate\nsignificant capabilities for collective intelligence. However, the potential\nmisuse of this intelligence for malicious purposes presents significant risks.\nTo date, comprehensive research on the safety issues associated with\nmulti-agent systems remains limited. From the perspective of agent psychology,\nwe discover that the dark psychological states of agents can lead to severe\nsafety issues. To address these issues, we propose a comprehensive framework\ngrounded in agent psychology. In our framework, we focus on three aspects:\nidentifying how dark personality traits in agents might lead to risky\nbehaviors, designing defense strategies to mitigate these risks, and evaluating\nthe safety of multi-agent systems from both psychological and behavioral\nperspectives. Our experiments reveal several intriguing phenomena, such as the\ncollective dangerous behaviors among agents, agents' propensity for\nself-reflection when engaging in dangerous behavior, and the correlation\nbetween agents' psychological assessments and their dangerous behaviors. We\nanticipate that our framework and observations will provide valuable insights\nfor further research into the safety of multi-agent systems. 
We will make our\ndata and code publicly accessible at https:/github.com/AI4Good24/PsySafe.\n","authors":["Zaibin Zhang","Yongting Zhang","Lijun Li","Hongzhi Gao","Lijun Wang","Huchuan Lu","Feng Zhao","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2401.11880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11864v1","updated":"2024-01-22T11:37:18Z","published":"2024-01-22T11:37:18Z","title":"Improving Small Language Models' Mathematical Reasoning via Mix Thoughts\n Distillation","summary":" This work addresses the challenge of democratizing advanced Large Language\nModels (LLMs) by compressing their mathematical reasoning capabilities into\nsub-billion parameter Small Language Models (SLMs) without compromising\nperformance. We introduce Equation-of-Thought Distillation (EoTD), a novel\ntechnique that encapsulates the reasoning process into equation-based\nrepresentations to construct an EoTD dataset for fine-tuning SLMs.\nAdditionally, we propose the Mix Thoughts Distillation (MTD) framework to\nenhance the reasoning performance of SLMs. This involves creating a reasoning\ndataset with multiple thought processes and using it for fine-tuning. Our\nexperimental findings demonstrate that EoTD significantly boosts the reasoning\nabilities of SLMs, while MTD enables these models to achieve state-of-the-art\nreasoning performance.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11852v1","updated":"2024-01-22T11:15:07Z","published":"2024-01-22T11:15:07Z","title":"The Right Model for the Job: An Evaluation of Legal Multi-Label\n Classification Baselines","summary":" Multi-Label Classification (MLC) is a common task in the legal domain, where\nmore than one label may be assigned to a legal document. A wide range of\nmethods can be applied, ranging from traditional ML approaches to the latest\nTransformer-based architectures. In this work, we perform an evaluation of\ndifferent MLC methods using two public legal datasets, POSTURE50K and\nEURLEX57K. By varying the amount of training data and the number of labels, we\nexplore the comparative advantage offered by different approaches in relation\nto the dataset properties. Our findings highlight DistilRoBERTa and LegalBERT\nas performing consistently well in legal MLC with reasonable computational\ndemands. T5 also demonstrates comparable performance while offering advantages\nas a generative model in the presence of changing label sets. Finally, we show\nthat the CrossEncoder exhibits potential for notable macro-F1 score\nimprovements, albeit with increased computational costs.\n","authors":["Martina Forster","Claudia Schulz","Prudhvi Nokku","Melicaalsadat Mirsafian","Jaykumar Kasundra","Stavroula Skylaki"],"pdf_url":"https://arxiv.org/pdf/2401.11852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11839v1","updated":"2024-01-22T10:57:09Z","published":"2024-01-22T10:57:09Z","title":"AI for social science and social science of AI: A Survey","summary":" Recent advancements in artificial intelligence, particularly with the\nemergence of large language models (LLMs), have sparked a rethinking of\nartificial general intelligence possibilities. The increasing human-like\ncapabilities of AI are also attracting attention in social science research,\nleading to various studies exploring the combination of these two fields. 
In\nthis survey, we systematically categorize previous explorations in the\ncombination of AI and social science into two directions that share common\ntechnical approaches but differ in their research objectives. The first\ndirection is focused on AI for social science, where AI is utilized as a\npowerful tool to enhance various stages of social science research. While the\nsecond direction is the social science of AI, which examines AI agents as\nsocial entities with their human-like cognitive and linguistic capabilities. By\nconducting a thorough review, particularly on the substantial progress\nfacilitated by recent advancements in large language models, this paper\nintroduces a fresh perspective to reassess the relationship between AI and\nsocial science, provides a cohesive framework that allows researchers to\nunderstand the distinctions and connections between AI for social science and\nsocial science of AI, and also summarized state-of-art experiment simulation\nplatforms to facilitate research in these two directions. We believe that as AI\ntechnology continues to advance and intelligent agents find increasing\napplications in our daily lives, the significance of the combination of AI and\nsocial science will become even more prominent.\n","authors":["Ruoxi Xu","Yingfei Sun","Mengjie Ren","Shiguang Guo","Ruotong Pan","Hongyu Lin","Le Sun","Xianpei Han"],"pdf_url":"https://arxiv.org/pdf/2401.11839v1.pdf","comment":"Accepted by Information Processing and Management (IP&M)"},{"id":"http://arxiv.org/abs/2401.11819v1","updated":"2024-01-22T10:30:11Z","published":"2024-01-22T10:30:11Z","title":"SuperCLUE-Math6: Graded Multi-Step Math Reasoning Benchmark for LLMs in\n Chinese","summary":" We introduce SuperCLUE-Math6(SC-Math6), a new benchmark dataset to evaluate\nthe mathematical reasoning abilities of Chinese language models. SC-Math6 is\ndesigned as an upgraded Chinese version of the GSM8K dataset with enhanced\ndifficulty, diversity, and application scope. It consists of over 2000\nmathematical word problems requiring multi-step reasoning and providing natural\nlanguage solutions. We propose an innovative scheme to quantify the reasoning\ncapability of large models based on performance over problems with different\nreasoning steps. Experiments on 12 representative Chinese models demonstrate a\nclear stratification of reasoning levels, with top models like GPT-4 showing\nsuperior performance. SC-Math6 fills the gap in Chinese mathematical reasoning\nbenchmarks and provides a comprehensive testbed to advance the intelligence of\nChinese language models.\n","authors":["Liang Xu","Hang Xue","Lei Zhu","Kangkang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.11819v1.pdf","comment":"8 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11817v1","updated":"2024-01-22T10:26:14Z","published":"2024-01-22T10:26:14Z","title":"Hallucination is Inevitable: An Innate Limitation of Large Language\n Models","summary":" Hallucination has been widely recognized to be a significant drawback for\nlarge language models (LLMs). There have been many works that attempt to reduce\nthe extent of hallucination. These efforts have mostly been empirical so far,\nwhich cannot answer the fundamental question whether it can be completely\neliminated. In this paper, we formalize the problem and show that it is\nimpossible to eliminate hallucination in LLMs. 
Specifically, we define a formal\nworld where hallucination is defined as inconsistencies between a computable\nLLM and a computable ground truth function. By employing results from learning\ntheory, we show that LLMs cannot learn all of the computable functions and will\ntherefore always hallucinate. Since the formal world is a part of the real\nworld which is much more complicated, hallucinations are also inevitable for\nreal world LLMs. Furthermore, for real world LLMs constrained by provable time\ncomplexity, we describe the hallucination-prone tasks and empirically validate\nour claims. Finally, using the formal world framework, we discuss the possible\nmechanisms and efficacies of existing hallucination mitigators as well as the\npractical implications on the safe deployment of LLMs.\n","authors":["Ziwei Xu","Sanjay Jain","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2401.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04408v3","updated":"2024-01-22T07:40:02Z","published":"2023-07-10T08:15:40Z","title":"TIM: Teaching Large Language Models to Translate with Comparison","summary":" Open-sourced large language models (LLMs) have demonstrated remarkable\nefficacy in various tasks with instruction tuning. However, these models can\nsometimes struggle with tasks that require more specialized knowledge such as\ntranslation. One possible reason for such deficiency is that instruction tuning\naims to generate fluent and coherent text that continues from a given\ninstruction without being constrained by any task-specific requirements.\nMoreover, it can be more challenging for tuning smaller LLMs with lower-quality\ntraining data. To address this issue, we propose a novel framework using\nexamples in comparison to teach LLMs to learn translation. 
Our approach\ninvolves presenting the model with examples of correct and incorrect\ntranslations and using a preference loss to guide the model's learning. We\nevaluate our method on WMT2022 test sets and show that it outperforms existing\nmethods. Our findings offer a new perspective on fine-tuning LLMs for\ntranslation tasks and provide a promising solution for generating high-quality\ntranslations. Please refer to Github for more details:\nhttps://github.com/lemon0830/TIM.\n","authors":["Jiali Zeng","Fandong Meng","Yongjing Yin","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.04408v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2309.12247v2","updated":"2024-01-22T07:24:30Z","published":"2023-09-21T16:47:30Z","title":"Bad Actor, Good Advisor: Exploring the Role of Large Language Models in\n Fake News Detection","summary":" Detecting fake news requires both a delicate sense of diverse clues and a\nprofound understanding of the real-world background, which remains challenging\nfor detectors based on small language models (SLMs) due to their knowledge and\ncapability limitations. Recent advances in large language models (LLMs) have\nshown remarkable performance in various tasks, but whether and how LLMs could\nhelp with fake news detection remains underexplored. In this paper, we\ninvestigate the potential of LLMs in fake news detection. First, we conduct an\nempirical study and find that a sophisticated LLM such as GPT 3.5 could\ngenerally expose fake news and provide desirable multi-perspective rationales\nbut still underperforms the basic SLM, fine-tuned BERT. Our subsequent analysis\nattributes such a gap to the LLM's inability to select and integrate rationales\nproperly to conclude. Based on these findings, we propose that current LLMs may\nnot substitute fine-tuned SLMs in fake news detection but can be a good advisor\nfor SLMs by providing multi-perspective instructive rationales. To instantiate\nthis proposal, we design an adaptive rationale guidance network for fake news\ndetection (ARG), in which SLMs selectively acquire insights on news analysis\nfrom the LLMs' rationales. We further derive a rationale-free version of ARG by\ndistillation, namely ARG-D, which services cost-sensitive scenarios without\nquerying LLMs. Experiments on two real-world datasets demonstrate that ARG and\nARG-D outperform three types of baseline methods, including SLM-based,\nLLM-based, and combinations of small and large language models.\n","authors":["Beizhe Hu","Qiang Sheng","Juan Cao","Yuhui Shi","Yang Li","Danding Wang","Peng Qi"],"pdf_url":"https://arxiv.org/pdf/2309.12247v2.pdf","comment":"16 pages, 5 figures, and 9 tables. To appear at AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11725v1","updated":"2024-01-22T07:07:06Z","published":"2024-01-22T07:07:06Z","title":"Speak It Out: Solving Symbol-Related Problems with Symbol-to-Language\n Conversion for Language Models","summary":" Symbols (or more broadly, non-natural language textual representations) such\nas numerical sequences, molecular formulas, and table delimiters widely exist,\nplaying important roles in various tasks such as abstract reasoning, chemical\nproperty prediction, and table question answering. 
Despite the impressive\nnatural language comprehension capabilities of large language models (LLMs),\ntheir reasoning abilities for symbols remain inadequate, which could attributed\nto the difference between symbol representations and general natural languages.\nWe propose symbol-to-language (S2L), a tuning-free method that enables large\nlanguage models to solve symbol-related problems with information expressed in\nnatural language. Specifically, S2L first converts the symbols involved to\nlanguage-based representations, which can be implemented by prompting LLMs or\nleveraging external tools, then these language-based representations are\nintegrated into the original problem via direct substitution or concatenation,\nserving as useful input information for LLMs. We evaluate the S2L method using\nboth API-based (GPT-4, ChatGPT) and open-source (OpenChat) models over eight\nsymbol-related tasks, ranging from symbol-only abstract reasoning to sentiment\nanalysis in social media. Experimental results show that S2L consistently leads\nto superior performance. For example, by employing S2L for GPT-4, there can be\naverage significant improvements of +21.9% and +9.5% for subtasks in 1D-ARC and\nDyck language, respectively. Codes and data are available at\nhttps://github.com/THUNLP-MT/symbol2language.\n","authors":["Yile Wang","Sijie Cheng","Zixin Sun","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09798v2","updated":"2024-01-22T06:22:55Z","published":"2024-01-18T08:36:54Z","title":"All in How You Ask for It: Simple Black-Box Method for Jailbreak Attacks","summary":" Large Language Models (LLMs) like ChatGPT face `jailbreak' challenges, where\nsafeguards are bypassed to produce ethically harmful prompts. This study\nproposes a simple black-box method to effectively generate jailbreak prompts,\novercoming the high complexity and computational costs associated with existing\nmethods. The proposed technique iteratively rewrites harmful prompts into\nnon-harmful expressions using the target LLM itself, based on the hypothesis\nthat LLMs can directly sample expressions that bypass safeguards. Demonstrated\nthrough experiments with ChatGPT (GPT-3.5 and GPT-4) and Gemini-Pro, this\nmethod achieved an attack success rate of over 80% within an average of 5\niterations and remained effective despite model updates. The generated\njailbreak prompts were naturally-worded and concise; moreover, they were\ndifficult-to-defend. These results indicate that creating effective jailbreak\nprompts is simpler than previously considered, suggesting that black-box\njailbreak attacks pose a more serious threat.\n","authors":["Kazuhiro Takemoto"],"pdf_url":"https://arxiv.org/pdf/2401.09798v2.pdf","comment":"12 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.11700v1","updated":"2024-01-22T05:46:11Z","published":"2024-01-22T05:46:11Z","title":"Keep Decoding Parallel with Effective Knowledge Distillation from\n Language Models to End-to-end Speech Recognisers","summary":" This study presents a novel approach for knowledge distillation (KD) from a\nBERT teacher model to an automatic speech recognition (ASR) model using\nintermediate layers. To distil the teacher's knowledge, we use an attention\ndecoder that learns from BERT's token probabilities. Our method shows that\nlanguage model (LM) information can be more effectively distilled into an ASR\nmodel using both the intermediate layers and the final layer. 
By using the\nintermediate layers as distillation target, we can more effectively distil LM\nknowledge into the lower network layers. Using our method, we achieve better\nrecognition accuracy than with shallow fusion of an external LM, allowing us to\nmaintain fast parallel decoding. Experiments on the LibriSpeech dataset\ndemonstrate the effectiveness of our approach in enhancing greedy decoding with\nconnectionist temporal classification (CTC).\n","authors":["Michael Hentschel","Yuta Nishikawa","Tatsuya Komatsu","Yusuke Fujita"],"pdf_url":"https://arxiv.org/pdf/2401.11700v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2304.03047v3","updated":"2024-01-22T04:57:32Z","published":"2023-04-06T13:07:17Z","title":"ETPNav: Evolving Topological Planning for Vision-Language Navigation in\n Continuous Environments","summary":" Vision-language navigation is a task that requires an agent to follow\ninstructions to navigate in environments. It becomes increasingly crucial in\nthe field of embodied AI, with potential applications in autonomous navigation,\nsearch and rescue, and human-robot interaction. In this paper, we propose to\naddress a more practical yet challenging counterpart setting - vision-language\nnavigation in continuous environments (VLN-CE). To develop a robust VLN-CE\nagent, we propose a new navigation framework, ETPNav, which focuses on two\ncritical skills: 1) the capability to abstract environments and generate\nlong-range navigation plans, and 2) the ability of obstacle-avoiding control in\ncontinuous environments. ETPNav performs online topological mapping of\nenvironments by self-organizing predicted waypoints along a traversed path,\nwithout prior environmental experience. It privileges the agent to break down\nthe navigation procedure into high-level planning and low-level control.\nConcurrently, ETPNav utilizes a transformer-based cross-modal planner to\ngenerate navigation plans based on topological maps and instructions. The plan\nis then performed through an obstacle-avoiding controller that leverages a\ntrial-and-error heuristic to prevent navigation from getting stuck in\nobstacles. Experimental results demonstrate the effectiveness of the proposed\nmethod. ETPNav yields more than 10% and 20% improvements over prior\nstate-of-the-art on R2R-CE and RxR-CE datasets, respectively. Our code is\navailable at https://github.com/MarSaKi/ETPNav.\n","authors":["Dong An","Hanqing Wang","Wenguan Wang","Zun Wang","Yan Huang","Keji He","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.03047v3.pdf","comment":"Project page: https://github.com/MarSaKi/ETPNav"},{"id":"http://arxiv.org/abs/2305.05352v6","updated":"2024-01-22T04:15:13Z","published":"2023-05-09T11:37:16Z","title":"A Taxonomy of Foundation Model based Systems through the Lens of\n Software Architecture","summary":" The recent release of large language model (LLM) based chatbots, such as\nChatGPT, has attracted huge interest in foundation models. It is widely\nbelieved that foundation models will serve as the fundamental building blocks\nfor future AI systems. As foundation models are in their early stages, the\ndesign of foundation model based systems has not yet been systematically\nexplored. There is limited understanding about the impact of introducing\nfoundation models in software architecture. 
Therefore, in this paper, we\npropose a taxonomy of foundation model based systems, which classifies and\ncompares the characteristics of foundation models and design options of\nfoundation model based systems. Our taxonomy comprises three categories: the\npretraining and adaptation of foundation models, the architecture design of\nfoundation model based systems, and responsible-AI-by-design. This taxonomy can\nserve as concrete guidance for making major architectural design decisions when\ndesigning foundation model based systems and highlights trade-offs arising from\ndesign decisions.\n","authors":["Qinghua Lu","Liming Zhu","Xiwei Xu","Yue Liu","Zhenchang Xing","Jon Whittle"],"pdf_url":"https://arxiv.org/pdf/2305.05352v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01538v3","updated":"2024-01-22T02:39:17Z","published":"2023-09-04T11:38:02Z","title":"ChatRule: Mining Logical Rules with Large Language Models for Knowledge\n Graph Reasoning","summary":" Logical rules are essential for uncovering the logical connections between\nrelations, which could improve reasoning performance and provide interpretable\nresults on knowledge graphs (KGs). Although there have been many efforts to\nmine meaningful logical rules over KGs, existing methods suffer from\ncomputationally intensive searches over the rule space and a lack of\nscalability for large-scale KGs. Besides, they often ignore the semantics of\nrelations which is crucial for uncovering logical connections. Recently, large\nlanguage models (LLMs) have shown impressive performance in the field of\nnatural language processing and various applications, owing to their emergent\nability and generalizability. In this paper, we propose a novel framework,\nChatRule, unleashing the power of large language models for mining logical\nrules over knowledge graphs. Specifically, the framework is initiated with an\nLLM-based rule generator, leveraging both the semantic and structural\ninformation of KGs to prompt LLMs to generate logical rules. To refine the\ngenerated rules, a rule ranking module estimates the rule quality by\nincorporating facts from existing KGs. Last, the ranked rules can be used to\nconduct reasoning over KGs. ChatRule is evaluated on four large-scale KGs,\nw.r.t. different rule quality metrics and downstream tasks, showing the\neffectiveness and scalability of our method.\n","authors":["Linhao Luo","Jiaxin Ju","Bo Xiong","Yuan-Fang Li","Gholamreza Haffari","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2309.01538v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11645v1","updated":"2024-01-22T01:44:42Z","published":"2024-01-22T01:44:42Z","title":"Streaming Bilingual End-to-End ASR model using Attention over Multiple\n Softmax","summary":" Even with several advancements in multilingual modeling, it is challenging to\nrecognize multiple languages using a single neural model, without knowing the\ninput language and most multilingual models assume the availability of the\ninput language. In this work, we propose a novel bilingual end-to-end (E2E)\nmodeling approach, where a single neural model can recognize both languages and\nalso support switching between the languages, without any language input from\nthe user. The proposed model has shared encoder and prediction networks, with\nlanguage-specific joint networks that are combined via a self-attention\nmechanism. 
As the language-specific posteriors are combined, it produces a\nsingle posterior probability over all the output symbols, enabling a single\nbeam search decoding and also allowing dynamic switching between the languages.\nThe proposed approach outperforms the conventional bilingual baseline with\n13.3%, 8.23% and 1.3% word error rate relative reduction on Hindi, English and\ncode-mixed test sets, respectively.\n","authors":["Aditya Patil","Vikas Joshi","Purvi Agrawal","Rupesh Mehta"],"pdf_url":"https://arxiv.org/pdf/2401.11645v1.pdf","comment":"Published in IEEE's Spoken Language Technology (SLT) 2022, 8 pages (6\n + 2 for references), 5 figures"},{"id":"http://arxiv.org/abs/2109.01636v4","updated":"2024-01-22T01:23:23Z","published":"2021-09-03T17:28:04Z","title":"Empirical Study of Named Entity Recognition Performance Using\n Distribution-aware Word Embedding","summary":" With the fast development of Deep Learning techniques, Named Entity\nRecognition (NER) is becoming more and more important in the information\nextraction task. The greatest difficulty that the NER task faces is to keep the\ndetectability even when types of NE and documents are unfamiliar. Realizing\nthat the specificity information may contain potential meanings of a word and\ngenerate semantic-related features for word embedding, we develop a\ndistribution-aware word embedding and implement three different methods to make\nuse of the distribution information in a NER framework. And the result shows\nthat the performance of NER will be improved if the word specificity is\nincorporated into existing NER methods.\n","authors":["Xin Chen","Qi Zhao","Xinyang Liu"],"pdf_url":"https://arxiv.org/pdf/2109.01636v4.pdf","comment":"Want to correct"},{"id":"http://arxiv.org/abs/2401.11641v1","updated":"2024-01-22T01:06:17Z","published":"2024-01-22T01:06:17Z","title":"Revolutionizing Finance with LLMs: An Overview of Applications and\n Insights","summary":" In recent years, Large Language Models (LLMs) like ChatGPT have seen\nconsiderable advancements and have been applied in diverse fields. Built on the\nTransformer architecture, these models are trained on extensive datasets,\nenabling them to understand and generate human language effectively. In the\nfinancial domain, the deployment of LLMs is gaining momentum. These models are\nbeing utilized for automating financial report generation, forecasting market\ntrends, analyzing investor sentiment, and offering personalized financial\nadvice. Leveraging their natural language processing capabilities, LLMs can\ndistill key insights from vast financial data, aiding institutions in making\ninformed investment choices and enhancing both operational efficiency and\ncustomer satisfaction. In this study, we provide a comprehensive overview of\nthe emerging integration of LLMs into various financial tasks. Additionally, we\nconducted holistic tests on multiple financial tasks through the combination of\nnatural language instructions. Our findings show that GPT-4 effectively follow\nprompt instructions across various financial tasks. 
This survey and evaluation\nof LLMs in the financial domain aim to deepen the understanding of LLMs'\ncurrent role in finance for both financial practitioners and LLM researchers,\nidentify new research and application prospects, and highlight how these\ntechnologies can be leveraged to solve practical challenges in the finance\nindustry.\n","authors":["Huaqin Zhao","Zhengliang Liu","Zihao Wu","Yiwei Li","Tianze Yang","Peng Shu","Shaochen Xu","Haixing Dai","Lin Zhao","Gengchen Mai","Ninghao Liu","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.14358v2","updated":"2024-01-22T00:38:08Z","published":"2022-06-29T01:57:44Z","title":"Using Twitter Data to Understand Public Perceptions of Approved versus\n Off-label Use for COVID-19-related Medications","summary":" Understanding public discourse on emergency use of unproven therapeutics is\ncrucial for monitoring safe use and combating misinformation. We developed a\nnatural language processing-based pipeline to comprehend public perceptions of\nand stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter\nover time. This retrospective study included 609,189 US-based tweets from\nJanuary 29, 2020, to November 30, 2021, about four drugs that garnered\nsignificant public attention during the COVID-19 pandemic: (1)\nHydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2)\nMolnupiravir and Remdesivir, FDA-approved treatments for eligible patients.\nTime-trend analysis was employed to understand popularity trends and related\nevents. Content and demographic analyses were conducted to explore potential\nrationales behind people's stances on each drug. Time-trend analysis indicated\nthat Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir\nand Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and\nIvermectin discussions were highly politicized, related to conspiracy theories,\nhearsay, and celebrity influences. The distribution of stances between the two\nmajor US political parties was significantly different (P < .001); Republicans\nwere more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than\nDemocrats. People with healthcare backgrounds tended to oppose\nHydroxychloroquine (7%) more than the general population, while the general\npopulation was more likely to support Ivermectin (14%). Our study found that\nsocial media users have varying perceptions and stances on off-label versus\nFDA-authorized drug use at different stages of COVID-19. This indicates that\nhealth systems, regulatory agencies, and policymakers should design tailored\nstrategies to monitor and reduce misinformation to promote safe drug use.\n","authors":["Yining Hua","Hang Jiang","Shixu Lin","Jie Yang","Joseph M. Plasek","David W. Bates","Li Zhou"],"pdf_url":"https://arxiv.org/pdf/2206.14358v2.pdf","comment":"Full paper published in JAMIA"},{"id":"http://arxiv.org/abs/2306.16001v2","updated":"2024-01-22T00:27:45Z","published":"2023-06-28T08:20:35Z","title":"Streamlining Social Media Information Extraction for Public Health\n Research with Deep Learning","summary":" Objective: Social media-based public health research is crucial for epidemic\nsurveillance, but most studies identify relevant corpora with keyword matching.\nThis study develops a system to streamline the process of curating colloquial\nmedical dictionaries. 
We demonstrate the pipeline by curating a UMLS-colloquial\nsymptom dictionary from COVID-19-related tweets as proof of concept. Methods:\nCOVID-19-related tweets from February 1, 2020, to April 30, 2022 were used. The\npipeline includes three modules: a named entity recognition module to detect\nsymptoms in tweets; an entity normalization module to aggregate detected\nentities; and a mapping module that iteratively maps entities to Unified\nMedical Language System concepts. A random 500 entity sample were drawn from\nthe final dictionary for accuracy validation. Additionally, we conducted a\nsymptom frequency distribution analysis to compare our dictionary to a\npre-defined lexicon from previous research. Results: We identified 498,480\nunique symptom entity expressions from the tweets. Pre-processing reduces the\nnumber to 18,226. The final dictionary contains 38,175 unique expressions of\nsymptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). Symptom\ndistribution analysis found that our dictionary detects more symptoms and is\neffective at identifying psychiatric disorders like anxiety and depression,\noften missed by pre-defined lexicons. Conclusion: This study advances public\nhealth research by implementing a novel, systematic pipeline for curating\nsymptom lexicons from social media data. The final lexicon's high accuracy,\nvalidated by medical professionals, underscores the potential of this\nmethodology to reliably interpret and categorize vast amounts of unstructured\nsocial media data into actionable medical insights across diverse linguistic\nand regional landscapes.\n","authors":["Yining Hua","Shixu Lin","Minghui Li","Yujie Zhang","Dinah Foer","Siwen Wang","Peilin Zhou","Li Zhou","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2306.16001v2.pdf","comment":"Updated full paper. Abstract presented at IEEE ICHI 2023 and AMIA\n Annual Symposium 2023"},{"id":"http://arxiv.org/abs/2401.12413v1","updated":"2024-01-22T23:55:00Z","published":"2024-01-22T23:55:00Z","title":"How Far Can 100 Samples Go? Unlocking Overall Zero-Shot Multilingual\n Translation via Tiny Multi-Parallel Data","summary":" Zero-shot translation is an open problem, aiming to translate between\nlanguage pairs unseen during training in Multilingual Machine Translation\n(MMT). A common, albeit resource-consuming, solution is to mine as many\ntranslation directions as possible to add to the parallel corpus. In this\npaper, we show that the zero-shot capability of an English-centric model can be\neasily enhanced by fine-tuning with a very small amount of multi-parallel data.\nFor example, on the EC30 dataset, we show that up to +21.7 ChrF non-English\noverall improvements (870 directions) can be achieved by using only 100\nmulti-parallel samples, meanwhile preserving capability in English-centric\ndirections. We further study the size effect of fine-tuning data and its\ntransfer capabilities. Surprisingly, our empirical analysis shows that\ncomparable overall improvements can be achieved even through fine-tuning in a\nsmall, randomly sampled direction set (10\\%). Also, the resulting non-English\nperformance is quite close to the upper bound (complete translation). 
Due to\nits high efficiency and practicality, we encourage the community 1) to consider\nthe use of the fine-tuning method as a strong baseline for zero-shot\ntranslation and 2) to construct more comprehensive and high-quality\nmulti-parallel data to cover real-world demand.\n","authors":["Di Wu","Shaomu Tan","Yan Meng","David Stap","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2401.12413v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.12406v1","updated":"2024-01-22T23:35:09Z","published":"2024-01-22T23:35:09Z","title":"Enhancing In-context Learning via Linear Probe Calibration","summary":" In-context learning (ICL) is a new paradigm for natural language processing\nthat utilizes Generative Pre-trained Transformer (GPT)-like models. This\napproach uses prompts that include in-context demonstrations to generate the\ncorresponding output for a new query input. However, applying ICL in real cases\ndoes not scale with the number of samples, and lacks robustness to different\nprompt templates and demonstration permutations. In this paper, we first show\nthat GPT-like models using ICL result in unreliable predictions based on a new\nmetric based on Shannon entropy. Then, to solve this problem, we propose a new\ntechnique called the Linear Probe Calibration (LinC), a method that calibrates\nthe model's output probabilities, resulting in reliable predictions and\nimproved performance, while requiring only minimal additional samples (as few\nas five labeled data samples). LinC significantly enhances the ICL test\nperformance of GPT models on various benchmark datasets, with an average\nimprovement of up to 21%, and up to a 50% improvement in some cases, and\nsignificantly boosts the performance of PEFT methods, especially in the low\nresource regime. Moreover, LinC achieves lower expected calibration error, and\nis highly robust to varying label proportions, prompt templates, and\ndemonstration permutations. Our code is available at\n\\url{https://github.com/mominabbass/LinC}.\n","authors":["Momin Abbas","Yi Zhou","Parikshit Ram","Nathalie Baracaldo","Horst Samulowitz","Theodoros Salonidis","Tianyi Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12406v1.pdf","comment":"Accepted at AISTATS2024"},{"id":"http://arxiv.org/abs/2309.08007v2","updated":"2024-01-22T23:05:55Z","published":"2023-09-14T19:33:27Z","title":"DiariST: Streaming Speech Translation with Speaker Diarization","summary":" End-to-end speech translation (ST) for conversation recordings involves\nseveral under-explored challenges such as speaker diarization (SD) without\naccurate word time stamps and handling of overlapping speech in a streaming\nfashion. In this work, we propose DiariST, the first streaming ST and SD\nsolution. It is built upon a neural transducer-based streaming ST system and\nintegrates token-level serialized output training and t-vector, which were\noriginally developed for multi-talker speech recognition. Due to the absence of\nevaluation benchmarks in this area, we develop a new evaluation dataset,\nDiariST-AliMeeting, by translating the reference Chinese transcriptions of the\nAliMeeting corpus into English. We also propose new metrics, called\nspeaker-agnostic BLEU and speaker-attributed BLEU, to measure the ST quality\nwhile taking SD accuracy into account. Our system achieves a strong ST and SD\ncapability compared to offline systems based on Whisper, while performing\nstreaming inference for overlapping speech. 
To facilitate the research in this\nnew direction, we release the evaluation data, the offline baseline systems,\nand the evaluation code.\n","authors":["Mu Yang","Naoyuki Kanda","Xiaofei Wang","Junkun Chen","Peidong Wang","Jian Xue","Jinyu Li","Takuya Yoshioka"],"pdf_url":"https://arxiv.org/pdf/2309.08007v2.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12382v1","updated":"2024-01-22T22:16:55Z","published":"2024-01-22T22:16:55Z","title":"Longitudinal Sentiment Classification of Reddit Posts","summary":" We report results of a longitudinal sentiment classification of Reddit posts\nwritten by students of four major Canadian universities. We work with the texts\nof the posts, concentrating on the years 2020-2023. By finely tuning a\nsentiment threshold to a range of [-0.075,0.075], we successfully built\nclassifiers proficient in categorizing post sentiments into positive and\nnegative categories. Noticeably, our sentiment classification results are\nconsistent across the four university data sets.\n","authors":["Fabian Nwaoha","Ziyad Gaffar","Ho Joon Chun","Marina Sokolova"],"pdf_url":"https://arxiv.org/pdf/2401.12382v1.pdf","comment":"11 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2310.00737v3","updated":"2024-01-22T22:12:05Z","published":"2023-10-01T17:25:56Z","title":"GenAI Against Humanity: Nefarious Applications of Generative Artificial\n Intelligence and Large Language Models","summary":" Generative Artificial Intelligence (GenAI) and Large Language Models (LLMs)\nare marvels of technology; celebrated for their prowess in natural language\nprocessing and multimodal content generation, they promise a transformative\nfuture. But as with all powerful tools, they come with their shadows. Picture\nliving in a world where deepfakes are indistinguishable from reality, where\nsynthetic identities orchestrate malicious campaigns, and where targeted\nmisinformation or scams are crafted with unparalleled precision. Welcome to the\ndarker side of GenAI applications. This article is not just a journey through\nthe meanders of potential misuse of GenAI and LLMs, but also a call to\nrecognize the urgency of the challenges ahead. As we navigate the seas of\nmisinformation campaigns, malicious content generation, and the eerie creation\nof sophisticated malware, we'll uncover the societal implications that ripple\nthrough the GenAI revolution we are witnessing. From AI-powered botnets on\nsocial media platforms to the unnerving potential of AI to generate fabricated\nidentities, or alibis made of synthetic realities, the stakes have never been\nhigher. The lines between the virtual and the real worlds are blurring, and the\nconsequences of potential GenAI's nefarious applications impact us all. 
This\narticle serves both as a synthesis of rigorous research presented on the risks\nof GenAI and misuse of LLMs and as a thought-provoking vision of the different\ntypes of harmful GenAI applications we might encounter in the near future, and\nsome ways we can prepare for them.\n","authors":["Emilio Ferrara"],"pdf_url":"https://arxiv.org/pdf/2310.00737v3.pdf","comment":"Accepted in: Journal of Computational Social Science"},{"id":"http://arxiv.org/abs/2401.12375v1","updated":"2024-01-22T21:59:00Z","published":"2024-01-22T21:59:00Z","title":"Development of an NLP-driven computer-based test guide for visually\n impaired students","summary":" In recent years, advancements in Natural Language Processing (NLP) techniques\nhave revolutionized the field of accessibility and exclusivity of testing,\nparticularly for visually impaired students (VIS). CBT has shown in years back\nits relevance in terms of administering exams electronically, making the test\nprocess easier, providing quicker and more accurate results, and offering\ngreater flexibility and accessibility for candidates. Yet, its relevance was\nnot felt by the visually impaired students as they cannot access printed\ndocuments. Hence, in this paper, we present an NLP-driven Computer-Based Test\nguide for visually impaired students. It employs a speech technology\npre-trained methods to provide real-time assistance and support to visually\nimpaired students. The system utilizes NLP technologies to convert the\ntext-based questions and the associated options in a machine-readable format.\nSubsequently, the speech technology pre-trained model processes the converted\ntext enabling the VIS to comprehend and analyze the content. Furthermore, we\nvalidated that this pre-trained model is not perverse by testing for accuracy\nusing sample audio datasets labels (A, B, C, D, E, F, G) to compare with the\nvoice recordings obtained from 20 VIS which is been predicted by the system to\nattain values for precision, recall, and F1-scores. These metrics are used to\nassess the performance of the pre-trained model and have indicated that it is\nproficient enough to give its better performance to the evaluated system. The\nmethodology adopted for this system is Object Oriented Analysis and Design\nMethodology (OOADM) where Objects are discussed and built by modeling\nreal-world instances.\n","authors":["Tubo Faustinah Nemieboka","Ikechukwu E. Onyenwe","Doris C. Asogwa"],"pdf_url":"https://arxiv.org/pdf/2401.12375v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.14259v4","updated":"2024-01-22T20:47:51Z","published":"2023-05-23T17:12:08Z","title":"Learning to Generate Novel Scientific Directions with Contextualized\n Literature-based Discovery","summary":" Literature-Based Discovery (LBD) aims to discover new scientific knowledge by\nmining papers and generating hypotheses. Standard LBD is limited to predicting\npairwise relations between discrete concepts (e.g., drug-disease links), and\nignores critical contexts like experimental settings (e.g., a specific patient\npopulation where a drug is evaluated) and background motivations (e.g., to find\ndrugs without specific side effects). We address these limitations with a novel\nformulation of contextualized-LBD (C-LBD): generating scientific hypotheses in\nnatural language, while grounding them in a context that controls the\nhypothesis search space. We present a modeling framework using retrieval of\n``inspirations'' from past scientific papers. 
Our evaluations reveal that GPT-4\ntends to generate ideas with overall low technical depth and novelty, while our\ninspiration prompting approaches partially mitigate this issue. Our work\nrepresents a first step toward building language models that generate new ideas\nderived from scientific literature.\n","authors":["Qingyun Wang","Doug Downey","Heng Ji","Tom Hope"],"pdf_url":"https://arxiv.org/pdf/2305.14259v4.pdf","comment":"25 pages. Code and resources are available at\n https://github.com/EagleW/CLBD"},{"id":"http://arxiv.org/abs/2401.12343v1","updated":"2024-01-22T20:17:06Z","published":"2024-01-22T20:17:06Z","title":"Subgraph Extraction-based Feedback-guided Iterative Scheduling for HLS","summary":" This paper proposes ISDC, a novel feedback-guided iterative system of\ndifference constraints (SDC) scheduling algorithm for high-level synthesis\n(HLS). ISDC leverages subgraph extraction-based low-level feedback from\ndownstream tools like logic synthesizers to iteratively refine HLS scheduling.\nTechnical innovations include: (1) An enhanced SDC formulation that effectively\nintegrates low-level feedback into the linear-programming (LP) problem; (2) A\nfanout and window-based subgraph extraction mechanism driving the feedback\ncycle; (3) A no-human-in-loop ISDC flow compatible with a wide range of\ndownstream tools and process design kits (PDKs). Evaluation shows that ISDC\nreduces register usage by 28.5% against an industrial-strength open-source HLS\ntool.\n","authors":["Hanchen Ye","David Z. Pan","Chris Leary","Deming Chen","Xiaoqing Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12343v1.pdf","comment":"DATE'24"},{"id":"http://arxiv.org/abs/2401.12326v1","updated":"2024-01-22T19:39:05Z","published":"2024-01-22T19:39:05Z","title":"Fine-tuning Large Language Models for Multigenerator, Multidomain, and\n Multilingual Machine-Generated Text Detection","summary":" SemEval-2024 Task 8 introduces the challenge of identifying machine-generated\ntexts from diverse Large Language Models (LLMs) in various languages and\ndomains. The task comprises three subtasks: binary classification in\nmonolingual and multilingual (Subtask A), multi-class classification (Subtask\nB), and mixed text detection (Subtask C). This paper focuses on Subtasks A & B.\nEach subtask is supported by three datasets for training, development, and\ntesting. To tackle this task, two methods are employed: 1) traditional machine\nlearning (ML) with natural language preprocessing (NLP) for feature extraction,\nand 2) fine-tuning LLMs for text classification. The results show that\ntransformer models, particularly LoRA-RoBERTa, exceed traditional ML methods in\neffectiveness, with majority voting being particularly effective in\nmultilingual contexts for identifying machine-generated texts.\n","authors":["Feng Xiong","Thanet Markchom","Ziwei Zheng","Subin Jung","Varun Ojha","Huizhi Liang"],"pdf_url":"https://arxiv.org/pdf/2401.12326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08919v2","updated":"2024-01-22T19:07:07Z","published":"2024-01-17T02:04:59Z","title":"Partial Diacritization: A Context-Contrastive Inference Approach","summary":" Diacritization plays a pivotal role in improving readability and\ndisambiguating the meaning of Arabic texts. Efforts have so far focused on\nmarking every eligible character (Full Diacritization). Comparatively\noverlooked, Partial Diacritization (PD) is the selection of a subset of\ncharacters to be marked to aid comprehension where needed. 
Research has\nindicated that excessive diacritic marks can hinder skilled readers--reducing\nreading speed and accuracy. We conduct a behavioral experiment and show that\npartially marked text is often easier to read than fully marked text, and\nsometimes easier than plain text. In this light, we introduce\nContext-Contrastive Partial Diacritization (CCPD)--a novel approach to PD which\nintegrates seamlessly with existing Arabic diacritization systems. CCPD\nprocesses each word twice, once with context and once without, and diacritizes\nonly the characters with disparities between the two inferences. Further, we\nintroduce novel indicators for measuring partial diacritization quality (SR,\nPDER, HDER, ERE), essential for establishing this as a machine learning task.\nLastly, we introduce TD2, a Transformer-variant of an established model which\noffers a markedly different performance profile on our proposed indicators\ncompared to all other known systems.\n","authors":["Muhammad ElNokrashy","Badr AlKhamissi"],"pdf_url":"https://arxiv.org/pdf/2401.08919v2.pdf","comment":"13 equations, 5 tables, 5 figures"},{"id":"http://arxiv.org/abs/2401.12295v1","updated":"2024-01-22T19:00:11Z","published":"2024-01-22T19:00:11Z","title":"Cheap Learning: Maximising Performance of Language Models for Social\n Data Science Using Minimal Data","summary":" The field of machine learning has recently made significant progress in\nreducing the requirements for labelled training data when building new models.\nThese `cheaper' learning techniques hold significant potential for the social\nsciences, where development of large labelled training datasets is often a\nsignificant practical impediment to the use of machine learning for analytical\ntasks. In this article we review three `cheap' techniques that have developed\nin recent years: weak supervision, transfer learning and prompt engineering.\nFor the latter, we also review the particular case of zero-shot prompting of\nlarge language models. For each technique we provide a guide of how it works\nand demonstrate its application across six different realistic social science\napplications (two different tasks paired with three different dataset makeups).\nWe show good performance for all techniques, and in particular we demonstrate\nhow prompting of large language models can achieve high accuracy at very low\ncost. Our results are accompanied by a code repository to make it easy for\nothers to duplicate our work and use it in their own research. Overall, our\narticle is intended to stimulate further uptake of these techniques in the\nsocial sciences.\n","authors":["Leonardo Castro-Gonzalez","Yi-Ling Chung","Hannak Rose Kirk","John Francis","Angus R. Williams","Pica Johansson","Jonathan Bright"],"pdf_url":"https://arxiv.org/pdf/2401.12295v1.pdf","comment":"39 pages, 10 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.12292v1","updated":"2024-01-22T19:00:08Z","published":"2024-01-22T19:00:08Z","title":"GRATH: Gradual Self-Truthifying for Large Language Models","summary":" Truthfulness is paramount for large language models (LLMs) as they are\nincreasingly deployed in real-world applications. However, existing LLMs still\nstruggle with generating truthful answers and content, as evidenced by their\nmodest performance on benchmarks like TruthfulQA. To address this issue, we\npropose GRAdual self-truTHifying (GRATH), a novel post-processing method to\nenhance truthfulness of LLMs. 
GRATH utilizes out-of-domain question prompts to\ngenerate corresponding answers and adaptively optimizes the model via direct\npreference optimization (DPO). Note that during this process, GRATH learns\ntruthfulness in a self-supervised manner without requiring annotated answers.\nIn particular, GRATH first generates pairwise truthfulness training data by\nprompting the LLM itself, with each pair containing a question and its correct\nand incorrect answers. The model is then fine-tuned using DPO to learn from the\ndifference between answer pairs. Subsequently, GRATH iteratively refines the\ntruthfulness data and optimizes the model, leading to a gradual improvement in\nmodel truthfulness. Empirically, we evaluate GRATH using different 7B-LLMs and\ncompare with LLMs with similar or even larger sizes on benchmark datasets. Our\nresults show that GRATH effectively improves LLMs' truthfulness without\ncompromising other core capabilities. Notably, GRATH achieves state-of-the-art\nperformance on TruthfulQA, with MC1 accuracy as 54.71% and MC2 accuracy as\n69.10%, which even surpass those on larger-scale models, such as\nLlama2-Chat-70B, by 23.62% and 24.18%, respectively.\n","authors":["Weixin Chen","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2401.12292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12273v1","updated":"2024-01-22T17:11:37Z","published":"2024-01-22T17:11:37Z","title":"The Ethics of Interaction: Mitigating Security Threats in LLMs","summary":" This paper comprehensively explores the ethical challenges arising from\nsecurity threats to Language Learning Models (LLMs). These intricate digital\nrepositories are increasingly integrated into our daily lives, making them\nprime targets for attacks that can compromise their training data and the\nconfidentiality of their data sources. The paper delves into the nuanced\nethical repercussions of such security threats on society and individual\nprivacy. We scrutinize five major threats: prompt injection, jailbreaking,\nPersonal Identifiable Information (PII) exposure, sexually explicit content,\nand hate based content, going beyond mere identification to assess their\ncritical ethical consequences and the urgency they create for robust defensive\nstrategies. The escalating reliance on LLMs underscores the crucial need for\nensuring these systems operate within the bounds of ethical norms, particularly\nas their misuse can lead to significant societal and individual harm. We\npropose conceptualizing and developing an evaluative tool tailored for LLMs,\nwhich would serve a dual purpose, guiding developers and designers in\npreemptive fortification of backend systems and scrutinizing the ethical\ndimensions of LLM chatbot responses during the testing phase. By comparing LLM\nresponses with those expected from humans in a moral context, we aim to discern\nthe degree to which AI behaviors align with the ethical values held by a\nbroader society. 
Ultimately, this paper not only underscores the ethical\ntroubles presented by LLMs, it also highlights a path toward cultivating trust\nin these systems.\n","authors":["Ashutosh Kumar","Sagarika Singh","Shiv Vignesh Murty","Swathy Ragupathy"],"pdf_url":"https://arxiv.org/pdf/2401.12273v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.12217v1","updated":"2024-01-22T18:59:29Z","published":"2024-01-22T18:59:29Z","title":"Exploring Simple Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation models aim to accurately assign a\nsemantic label to each pixel in an image from a set of arbitrary\nopen-vocabulary texts. In order to learn such pixel-level alignment, current\napproaches typically rely on a combination of (i) image-level VL model (e.g.\nCLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this\npaper, we introduce S-Seg, a novel model that can achieve surprisingly strong\nperformance without depending on any of the above elements. S-Seg leverages\npseudo-mask and language to train a MaskFormer, and can be easily trained from\npublicly available image-text datasets. Contrary to prior works, our model\ndirectly trains for pixel-level features and language alignment. Once trained,\nS-Seg generalizes well to multiple testing datasets without requiring\nfine-tuning. In addition, S-Seg has the extra benefits of scalability with data\nand consistently improvement when augmented with self-training. We believe that\nour simple yet effective approach will serve as a solid baseline for future\nresearch.\n","authors":["Zihang Lai"],"pdf_url":"https://arxiv.org/pdf/2401.12217v1.pdf","comment":"Code is available at: https://github.com/zlai0/S-Seg"},{"id":"http://arxiv.org/abs/2401.12215v1","updated":"2024-01-22T18:59:07Z","published":"2024-01-22T18:59:07Z","title":"Less Could Be Better: Parameter-efficient Fine-tuning Advances Medical\n Vision Foundation Models","summary":" Parameter-efficient fine-tuning (PEFT) that was initially developed for\nexploiting pre-trained large language models has recently emerged as an\neffective approach to perform transfer learning on computer vision tasks.\nHowever, the effectiveness of PEFT on medical vision foundation models is still\nunclear and remains to be explored. As a proof of concept, we conducted a\ndetailed empirical study on applying PEFT to chest radiography foundation\nmodels. Specifically, we delved into LoRA, a representative PEFT method, and\ncompared it against full-parameter fine-tuning (FFT) on two self-supervised\nradiography foundation models across three well-established chest radiograph\ndatasets. Our results showed that LoRA outperformed FFT in 13 out of 18\ntransfer learning tasks by at most 2.9% using fewer than 1% tunable parameters.\nCombining LoRA with foundation models, we set up new state-of-the-art on a\nrange of data-efficient learning tasks, such as an AUROC score of 80.6% using\n1% labeled data on NIH ChestX-ray14. We hope this study can evoke more\nattention from the community in the use of PEFT for transfer learning on\nmedical imaging tasks. 
Code and models are available at\nhttps://github.com/RL4M/MED-PEFT.\n","authors":["Chenyu Lian","Hong-Yu Zhou","Yizhou Yu","Liansheng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12215v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2310.00647v2","updated":"2024-01-22T18:53:48Z","published":"2023-10-01T12:02:59Z","title":"Beyond Task Performance: Evaluating and Reducing the Flaws of Large\n Multimodal Models with In-Context Learning","summary":" Following the success of Large Language Models (LLMs), Large Multimodal\nModels (LMMs), such as the Flamingo model and its subsequent competitors, have\nstarted to emerge as natural steps towards generalist agents. However,\ninteracting with recent LMMs reveals major limitations that are hardly captured\nby the current evaluation benchmarks. Indeed, task performances (e.g., VQA\naccuracy) alone do not provide enough clues to understand their real\ncapabilities, limitations, and to which extent such models are aligned to human\nexpectations. To refine our understanding of those flaws, we deviate from the\ncurrent evaluation paradigm, and (1) evaluate 10 recent open-source LMMs from\n3B up to 80B parameter scale, on 5 different axes; hallucinations, abstention,\ncompositionality, explainability and instruction following. Our evaluation on\nthese axes reveals major flaws in LMMs. While the current go-to solution to\nalign these models is based on training, such as instruction tuning or RLHF, we\nrather (2) explore the training-free in-context learning (ICL) as a solution,\nand study how it affects these limitations. Based on our ICL study, (3) we push\nICL further and propose new multimodal ICL variants such as; Multitask-ICL,\nChain-of-Hindsight-ICL, and Self-Correcting-ICL. Our findings are as follows.\n(1) Despite their success, LMMs have flaws that remain unsolved with scaling\nalone. (2) The effect of ICL on LMMs flaws is nuanced; despite its\neffectiveness for improved explainability, answer abstention, ICL only slightly\nimproves instruction following, does not improve compositional abilities, and\nactually even amplifies hallucinations. (3) The proposed ICL variants are\npromising as post-hoc approaches to efficiently tackle some of those flaws. The\ncode is available here: https://github.com/mshukor/EvALign-ICL.\n","authors":["Mustafa Shukor","Alexandre Rame","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2310.00647v2.pdf","comment":"ICLR 2024. Project Page: https://evalign-icl.github.io/"},{"id":"http://arxiv.org/abs/2401.12210v1","updated":"2024-01-22T18:52:51Z","published":"2024-01-22T18:52:51Z","title":"Connecting the Dots: Leveraging Spatio-Temporal Graph Neural Networks\n for Accurate Bangla Sign Language Recognition","summary":" Recent advances in Deep Learning and Computer Vision have been successfully\nleveraged to serve marginalized communities in various contexts. One such area\nis Sign Language - a primary means of communication for the deaf community.\nHowever, so far, the bulk of research efforts and investments have gone into\nAmerican Sign Language, and research activity into low-resource sign languages\n- especially Bangla Sign Language - has lagged significantly. In this research\npaper, we present a new word-level Bangla Sign Language dataset - BdSL40 -\nconsisting of 611 videos over 40 words, along with two different approaches:\none with a 3D Convolutional Neural Network model and another with a novel Graph\nNeural Network approach for the classification of BdSL40 dataset. 
This is the\nfirst study on word-level BdSL recognition, and the dataset was transcribed\nfrom Indian Sign Language (ISL) using the Bangla Sign Language Dictionary\n(1997). The proposed GNN model achieved an F1 score of 89%. The study\nhighlights the significant lexical and semantic similarity between BdSL, West\nBengal Sign Language, and ISL, and the lack of word-level datasets for BdSL in\nthe literature. We release the dataset and source code to stimulate further\nresearch.\n","authors":["Haz Sameen Shahgir","Khondker Salman Sayeed","Md Toki Tahmid","Tanjeem Azwad Zaman","Md. Zarif Ul Alam"],"pdf_url":"https://arxiv.org/pdf/2401.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12208v1","updated":"2024-01-22T18:51:07Z","published":"2024-01-22T18:51:07Z","title":"CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation","summary":" Chest X-rays (CXRs) are the most frequently performed imaging test in\nclinical practice. Recent advances in the development of vision-language\nfoundation models (FMs) give rise to the possibility of performing automated\nCXR interpretation, which can assist physicians with clinical decision-making\nand improve patient outcomes. However, developing FMs that can accurately\ninterpret CXRs is challenging due to the (1) limited availability of\nlarge-scale vision-language datasets in the medical image domain, (2) lack of\nvision and language encoders that can capture the complexities of medical data,\nand (3) absence of evaluation frameworks for benchmarking the abilities of FMs\non CXR interpretation. In this work, we address these challenges by first\nintroducing \\emph{CheXinstruct} - a large-scale instruction-tuning dataset\ncurated from 28 publicly-available datasets. We then present \\emph{CheXagent} -\nan instruction-tuned FM capable of analyzing and summarizing CXRs. To build\nCheXagent, we design a clinical large language model (LLM) for parsing\nradiology reports, a vision encoder for representing CXR images, and a network\nto bridge the vision and language modalities. Finally, we introduce\n\\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs\nacross 8 clinically-relevant CXR interpretation tasks. Extensive quantitative\nevaluations and qualitative reviews with five expert radiologists demonstrate\nthat CheXagent outperforms previously-developed general- and medical-domain FMs\non CheXbench tasks. Furthermore, in an effort to improve model transparency, we\nperform a fairness evaluation across factors of sex, race and age to highlight\npotential performance disparities. Our project is at\n\\url{https://stanford-aimi.github.io/chexagent.html}.\n","authors":["Zhihong Chen","Maya Varma","Jean-Benoit Delbrouck","Magdalini Paschali","Louis Blankemeier","Dave Van Veen","Jeya Maria Jose Valanarasu","Alaa Youssef","Joseph Paul Cohen","Eduardo Pontes Reis","Emily B. Tsai","Andrew Johnston","Cameron Olsen","Tanishq Mathew Abraham","Sergios Gatidis","Akshay S. Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2401.12208v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12202v1","updated":"2024-01-22T18:42:20Z","published":"2024-01-22T18:42:20Z","title":"OK-Robot: What Really Matters in Integrating Open-Knowledge Models for\n Robotics","summary":" Remarkable progress has been made in recent years in the fields of vision,\nlanguage, and robotics. 
We now have vision models capable of recognizing\nobjects based on language queries, navigation systems that can effectively\ncontrol mobile systems, and grasping models that can handle a wide range of\nobjects. Despite these advancements, general-purpose applications of robotics\nstill lag behind, even though they rely on these fundamental capabilities of\nrecognition, navigation, and grasping. In this paper, we adopt a systems-first\napproach to develop a new Open Knowledge-based robotics framework called\nOK-Robot. By combining Vision-Language Models (VLMs) for object detection,\nnavigation primitives for movement, and grasping primitives for object\nmanipulation, OK-Robot offers an integrated solution for pick-and-drop\noperations without requiring any training. To evaluate its performance, we run\nOK-Robot in 10 real-world home environments. The results demonstrate that\nOK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks,\nrepresenting a new state-of-the-art in Open Vocabulary Mobile Manipulation\n(OVMM) with nearly 1.8x the performance of prior work. In cleaner, uncluttered\nenvironments, OK-Robot's performance increases to 82%. However, the most\nimportant insight gained from OK-Robot is the critical role of nuanced details\nwhen combining Open Knowledge systems like VLMs with robotic modules. Videos of\nour experiments are available on our website: https://ok-robot.github.io\n","authors":["Peiqi Liu","Yaswanth Orru","Chris Paxton","Nur Muhammad Mahi Shafiullah","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2401.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12198v1","updated":"2024-01-22T18:38:44Z","published":"2024-01-22T18:38:44Z","title":"LONEStar: The Lunar Flashlight Optical Navigation Experiment","summary":" This paper documents the results from the highly successful Lunar Flashlight\nOptical Navigation Experiment with a Star tracker (LONEStar). Launched in\nDecember 2022, Lunar Flashlight (LF) was a NASA-funded technology demonstration\nmission. After a propulsion system anomaly prevented capture in lunar orbit, LF\nwas ejected from the Earth-Moon system and into heliocentric space. NASA\nsubsequently transferred ownership of LF to Georgia Tech to conduct an unfunded\nextended mission to demonstrate further advanced technology objectives,\nincluding LONEStar. From August-December 2023, the LONEStar team performed\non-orbit calibration of the optical instrument and a number of different OPNAV\nexperiments. This campaign included the processing of nearly 400 images of star\nfields, Earth and Moon, and four other planets (Mercury, Mars, Jupiter, and\nSaturn). LONEStar provided the first on-orbit demonstrations of heliocentric\nnavigation using only optical observations of planets. Of special note is the\nsuccessful in-flight demonstration of (1) instantaneous triangulation with\nsimultaneous sightings of two planets with the LOST algorithm and (2) dynamic\ntriangulation with sequential sightings of multiple planets.\n","authors":["Michael Krause","Ava Thrasher","Priyal Soni","Liam Smego","Reuben Isaac","Jennifer Nolan","Micah Pledger","E. Glenn Lightsey","W. 
Jud Ready","John Christian"],"pdf_url":"https://arxiv.org/pdf/2401.12198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12176v1","updated":"2024-01-22T18:09:15Z","published":"2024-01-22T18:09:15Z","title":"Broiler-Net: A Deep Convolutional Framework for Broiler Behavior\n Analysis in Poultry Houses","summary":" Detecting anomalies in poultry houses is crucial for maintaining optimal\nchicken health conditions, minimizing economic losses and bolstering\nprofitability. This paper presents a novel real-time framework for analyzing\nchicken behavior in cage-free poultry houses to detect abnormal behaviors.\nSpecifically, two significant abnormalities, namely inactive broiler and\nhuddling behavior, are investigated in this study. The proposed framework\ncomprises three key steps: (1) chicken detection utilizing a state-of-the-art\ndeep learning model, (2) tracking individual chickens across consecutive frames\nwith a fast tracker module, and (3) detecting abnormal behaviors within the\nvideo stream. Experimental studies are conducted to evaluate the efficacy of\nthe proposed algorithm in accurately assessing chicken behavior. The results\nillustrate that our framework provides a precise and efficient solution for\nreal-time anomaly detection, facilitating timely interventions to maintain\nchicken health and enhance overall productivity on poultry farms. Github:\nhttps://github.com/TaherehZarratEhsan/Chicken-Behavior-Analysis\n","authors":["Tahereh Zarrat Ehsan","Seyed Mehdi Mohtavipour"],"pdf_url":"https://arxiv.org/pdf/2401.12176v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.05916v3","updated":"2024-01-22T18:08:52Z","published":"2023-10-09T17:59:04Z","title":"Interpreting CLIP's Image Representation via Text-Based Decomposition","summary":" We investigate the CLIP image encoder by analyzing how individual model\ncomponents affect the final representation. We decompose the image\nrepresentation as a sum across individual image patches, model layers, and\nattention heads, and use CLIP's text representation to interpret the summands.\nInterpreting the attention heads, we characterize each head's role by\nautomatically finding text representations that span its output space, which\nreveals property-specific roles for many heads (e.g. location or shape). Next,\ninterpreting the image patches, we uncover an emergent spatial localization\nwithin CLIP. Finally, we use this understanding to remove spurious features\nfrom CLIP and to create a strong zero-shot image segmenter. Our results\nindicate that a scalable understanding of transformer models is attainable and\ncan be used to repair and improve models.\n","authors":["Yossi Gandelsman","Alexei A. Efros","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2310.05916v3.pdf","comment":"Project page and code:\n https://yossigandelsman.github.io/clip_decomposition/"},{"id":"http://arxiv.org/abs/2401.12175v1","updated":"2024-01-22T18:08:22Z","published":"2024-01-22T18:08:22Z","title":"Single-View 3D Human Digitalization with Large Reconstruction Models","summary":" In this paper, we introduce Human-LRM, a single-stage feed-forward Large\nReconstruction Model designed to predict human Neural Radiance Fields (NeRF)\nfrom a single image. 
Our approach demonstrates remarkable adaptability in\ntraining using extensive datasets containing 3D scans and multi-view capture.\nFurthermore, to enhance the model's applicability for in-the-wild scenarios\nespecially with occlusions, we propose a novel strategy that distills\nmulti-view reconstruction into single-view via a conditional triplane diffusion\nmodel. This generative extension addresses the inherent variations in human\nbody shapes when observed from a single view, and makes it possible to\nreconstruct the full body human from an occluded image. Through extensive\nexperiments, we show that Human-LRM surpasses previous methods by a significant\nmargin on several benchmarks.\n","authors":["Zhenzhen Weng","Jingyuan Liu","Hao Tan","Zhan Xu","Yang Zhou","Serena Yeung-Levy","Jimei Yang"],"pdf_url":"https://arxiv.org/pdf/2401.12175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12164v1","updated":"2024-01-22T17:56:07Z","published":"2024-01-22T17:56:07Z","title":"Semi-supervised segmentation of land cover images using nonlinear\n canonical correlation analysis with multiple features and t-SNE","summary":" Image segmentation is a clustering task whereby each pixel is assigned a\ncluster label. Remote sensing data usually consists of multiple bands of\nspectral images in which there exist semantically meaningful land cover\nsubregions, co-registered with other source data such as LIDAR (LIght Detection\nAnd Ranging) data, where available. This suggests that, in order to account for\nspatial correlation between pixels, a feature vector associated with each pixel\nmay be a vectorized tensor representing the multiple bands and a local patch as\nappropriate. 
Similarly, multiple types of texture features based on a pixel's\nlocal patch would also be beneficial for encoding locally statistical\ninformation and spatial variations, without necessarily labelling pixel-wise a\nlarge amount of ground truth, then training a supervised model, which is\nsometimes impractical. In this work, by resorting to label only a small\nquantity of pixels, a new semi-supervised segmentation approach is proposed.\nInitially, over all pixels, an image data matrix is created in high dimensional\nfeature space. Then, t-SNE projects the high dimensional data onto 3D\nembedding. By using radial basis functions as input features, which use the\nlabelled data samples as centres, to pair with the output class labels, a\nmodified canonical correlation analysis algorithm, referred to as RBF-CCA, is\nintroduced which learns the associated projection matrix via the small labelled\ndata set. The associated canonical variables, obtained for the full image, are\napplied by k-means clustering algorithm. The proposed semi-supervised RBF-CCA\nalgorithm has been implemented on several remotely sensed multispectral images,\ndemonstrating excellent segmentation results.\n","authors":["Hong Wei","James Xiao","Yichao Zhang","Xia Hong"],"pdf_url":"https://arxiv.org/pdf/2401.12164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12161v1","updated":"2024-01-22T17:55:16Z","published":"2024-01-22T17:55:16Z","title":"Automated facial recognition system using deep learning for pain\n assessment in adults with cerebral palsy","summary":" Background: Pain assessment in individuals with neurological conditions,\nespecially those with limited self-report ability and altered facial\nexpressions, presents challenges. Existing measures, relying on direct\nobservation by caregivers, lack sensitivity and specificity. In cerebral palsy,\npain is a common comorbidity and a reliable evaluation protocol is crucial.\nThus, having an automatic system that recognizes facial expressions could be of\nenormous help when diagnosing pain in this type of patient.\n Objectives: 1) to build a dataset of facial pain expressions in individuals\nwith cerebral palsy, and 2) to develop an automated facial recognition system\nbased on deep learning for pain assessment addressed to this population.\n Methods: Ten neural networks were trained on three pain image databases,\nincluding the UNBC-McMaster Shoulder Pain Expression Archive Database, the\nMultimodal Intensity Pain Dataset, and the Delaware Pain Database.\nAdditionally, a curated dataset (CPPAIN) was created, consisting of 109\npreprocessed facial pain expression images from individuals with cerebral\npalsy, categorized by two physiotherapists using the Facial Action Coding\nSystem observational scale.\n Results: InceptionV3 exhibited promising performance on the CP-PAIN dataset,\nachieving an accuracy of 62.67% and an F1 score of 61.12%. Explainable\nartificial intelligence techniques revealed consistent essential features for\npain identification across models.\n Conclusion: This study demonstrates the potential of deep learning models for\nrobust pain detection in populations with neurological conditions and\ncommunication disabilities. The creation of a larger dataset specific to\ncerebral palsy would further enhance model accuracy, offering a valuable tool\nfor discerning subtle and idiosyncratic pain expressions. The insights gained\ncould extend to other complex neurological conditions.\n","authors":["Álvaro Sabater-Gárriz","F. 
Xavier Gaya-Morey","José María Buades-Rubio","Cristina Manresa Yee","Pedro Montoya","Inmaculada Riquelme"],"pdf_url":"https://arxiv.org/pdf/2401.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08573v2","updated":"2024-01-22T17:54:58Z","published":"2024-01-16T18:58:36Z","title":"Benchmarking the Robustness of Image Watermarks","summary":" This paper investigates the weaknesses of image watermarking techniques. We\npresent WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel\nbenchmark for assessing watermark robustness, overcoming the limitations of\ncurrent evaluation methods. WAVES integrates detection and identification tasks,\nand establishes a standardized evaluation protocol comprised of a diverse range\nof stress tests. The attacks in WAVES range from traditional image distortions\nto advanced and novel variations of diffusive and adversarial attacks. Our\nevaluation examines two pivotal dimensions: the degree of image quality\ndegradation and the efficacy of watermark detection after attacks. We develop a\nseries of Performance vs. Quality 2D plots, varying over several prominent\nimage similarity metrics, which are then aggregated in a heuristically novel\nmanner to paint an overall picture of watermark robustness and attack potency.\nOur comprehensive evaluation reveals previously undetected vulnerabilities of\nseveral modern watermarking algorithms. We envision WAVES as a toolkit for the\nfuture development of robust watermarking systems. The project is available at\nhttps://wavesbench.github.io/\n","authors":["Bang An","Mucong Ding","Tahseen Rabbani","Aakriti Agrawal","Yuancheng Xu","Chenghao Deng","Sicheng Zhu","Abdirisak Mohamed","Yuxin Wen","Tom Goldstein","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02273v4","updated":"2024-01-22T17:37:03Z","published":"2023-07-05T13:17:14Z","title":"Joint Hierarchical Priors and Adaptive Spatial Resolution for Efficient\n Neural Image Compression","summary":" Recently, the performance of neural image compression (NIC) has steadily\nimproved thanks to the latest line of study, reaching or outperforming\nstate-of-the-art conventional codecs. Despite significant progress, current NIC\nmethods still rely on ConvNet-based entropy coding, limited in modeling\nlong-range dependencies due to their local connectivity and the increasing\nnumber of architectural biases and priors, resulting in complex underperforming\nmodels with high decoding latency. Motivated by the efficiency investigation of\nthe Transformer-based transform coding framework, namely SwinT-ChARM, we propose\nto enhance the latter, first, with a more straightforward yet effective\nTransformer-based channel-wise auto-regressive prior model, resulting in an\nabsolute image compression transformer (ICT). Through the proposed ICT, we can\ncapture both global and local contexts from the latent representations and\nbetter parameterize the distribution of the quantized latents. Further, we\nleverage a learnable scaling module with a sandwich ConvNeXt-based\npre-/post-processor to accurately extract more compact latent codes while\nreconstructing higher-quality images. Extensive experimental results on\nbenchmark datasets showed that the proposed framework significantly improves\nthe trade-off between coding efficiency and decoder complexity over the\nversatile video coding (VVC) reference encoder (VTM-18.0) and the neural codec\nSwinT-ChARM. 
Moreover, we provide model scaling studies to verify the\ncomputational efficiency of our approach and conduct several objective and\nsubjective analyses to bring to the fore the performance gap between the\nadaptive image compression transformer (AICT) and the neural codec SwinT-ChARM.\n","authors":["Ahmed Ghorbel","Wassim Hamidouche","Luce Morin"],"pdf_url":"https://arxiv.org/pdf/2307.02273v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12133v1","updated":"2024-01-22T17:15:02Z","published":"2024-01-22T17:15:02Z","title":"VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear\n Responses in VR Stand-up Interactive Games","summary":" Understanding and recognizing emotions are important and challenging issues\nin the metaverse era. Understanding, identifying, and predicting fear, which is\none of the fundamental human emotions, in virtual reality (VR) environments\nplays an essential role in immersive game development, scene development, and\nnext-generation virtual human-computer interaction applications. In this\narticle, we used VR horror games as a medium to analyze fear emotions by\ncollecting multi-modal data (posture, audio, and physiological signals) from 23\nplayers. We used an LSTM-based model to predict fear with accuracies of 65.31%\nand 90.47% under 6-level classification (no fear and five different levels of\nfear) and 2-level classification (no fear and fear), respectively. We\nconstructed a multi-modal natural behavior dataset of immersive human fear\nresponses (VRMN-bD) and compared it with existing relevant advanced datasets.\nThe results show that our dataset has fewer limitations in terms of collection\nmethod, data scale and audience scope. We are unique and advanced in targeting\nmulti-modal datasets of fear and behavior in VR stand-up interactive\nenvironments. Moreover, we discussed the implications of this work for\ncommunities and applications. The dataset and pre-trained model are available\nat https://github.com/KindOPSTAR/VRMN-bD.\n","authors":["He Zhang","Xinyang Li","Yuanxi Sun","Xinyi Fu","Christine Qiu","John M. Carroll"],"pdf_url":"https://arxiv.org/pdf/2401.12133v1.pdf","comment":"Accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.06144v2","updated":"2024-01-22T17:11:57Z","published":"2023-11-30T23:31:33Z","title":"DFU: scale-robust diffusion model for zero-shot super-resolution image\n generation","summary":" Diffusion generative models have achieved remarkable success in generating\nimages with a fixed resolution. However, existing models have limited ability\nto generalize to different resolutions when training data at those resolutions\nare not available. Leveraging techniques from operator learning, we present a\nnovel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the\nscore operator by combining both spatial and spectral information at multiple\nresolutions. Comparisons of DFU to baselines demonstrate its scalability: 1)\nsimultaneously training on multiple resolutions improves FID over training at\nany single fixed resolution; 2) DFU generalizes beyond its training\nresolutions, allowing for coherent, high-fidelity generation at\nhigher-resolutions with the same model, i.e. 
zero-shot super-resolution\nimage-generation; 3) we propose a fine-tuning strategy to further enhance the\nzero-shot super-resolution image-generation capability of our model, leading to\na FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which no\nother method can come close to achieving.\n","authors":["Alex Havrilla","Kevin Rojas","Wenjing Liao","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2401.06144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12129v1","updated":"2024-01-22T17:11:01Z","published":"2024-01-22T17:11:01Z","title":"Out-of-Distribution Detection & Applications With Ablated Learned\n Temperature Energy","summary":" As deep neural networks become adopted in high-stakes domains, it is crucial\nto be able to identify when inference inputs are Out-of-Distribution (OOD) so\nthat users can be alerted of likely drops in performance and calibration\ndespite high confidence. Among many others, existing methods use the following\ntwo scores to do so without training on any apriori OOD examples: a learned\ntemperature and an energy score. In this paper we introduce Ablated Learned\nTemperature Energy (or \"AbeT\" for short), a method which combines these prior\nmethods in novel ways with effective modifications. Due to these contributions,\nAbeT lowers the False Positive Rate at $95\\%$ True Positive Rate (FPR@95) by\n$35.39\\%$ in classification (averaged across all ID and OOD datasets measured)\ncompared to state of the art without training networks in multiple stages or\nrequiring hyperparameters or test-time backward passes. We additionally provide\nempirical insights as to how our model learns to distinguish between\nIn-Distribution (ID) and OOD samples while only being explicitly trained on ID\nsamples via exposure to misclassified ID examples at training time. Lastly, we\nshow the efficacy of our method in identifying predicted bounding boxes and\npixels corresponding to OOD objects in object detection and semantic\nsegmentation, respectively - with an AUROC increase of $5.15\\%$ in object\ndetection and both a decrease in FPR@95 of $41.48\\%$ and an increase in AUPRC\nof $34.20\\%$ on average in semantic segmentation compared to previous state of\nthe art.\n","authors":["Will LeVine","Benjamin Pikus","Jacob Phillips","Berk Norman","Fernando Amat Gil","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2401.12129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00454v2","updated":"2024-01-22T17:10:49Z","published":"2023-09-30T18:13:41Z","title":"UniLVSeg: Unified Left Ventricular Segmentation with Sparsely Annotated\n Echocardiogram Videos through Self-Supervised Temporal Masking and Weakly\n Supervised Training","summary":" Echocardiography has become an indispensable clinical imaging modality for\ngeneral heart health assessment. From calculating biomarkers such as ejection\nfraction to the probability of a patient's heart failure, accurate segmentation\nof the heart and its structures allows doctors to plan and execute treatments\nwith greater precision and accuracy. However, achieving accurate and robust\nleft ventricle segmentation is time-consuming and challenging due to different\nreasons. This work introduces a novel approach for consistent left ventricular\n(LV) segmentation from sparsely annotated echocardiogram videos. We achieve\nthis through (1) self-supervised learning (SSL) using temporal masking followed\nby (2) weakly supervised training. 
We investigate two different segmentation\napproaches: 3D segmentation and a novel 2D superimage (SI). We demonstrate how\nour proposed method outperforms the state-of-the-art solutions by achieving a\n93.32% (95%CI 93.21-93.43%) dice score on a large-scale dataset\n(EchoNet-Dynamic) while being more efficient. To show the effectiveness of our\napproach, we provide extensive ablation studies, including pre-training\nsettings and various deep learning backbones. Additionally, we discuss how our\nproposed methodology achieves high data utility by incorporating unlabeled\nframes in the training process. To help support the AI in medicine community,\nthe complete solution with the source code will be made publicly available upon\nacceptance.\n","authors":["Fadillah Maani","Asim Ukaye","Nada Saadi","Numan Saeed","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2310.00454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12074v1","updated":"2024-01-22T16:14:26Z","published":"2024-01-22T16:14:26Z","title":"DeepCERES: A Deep learning method for cerebellar lobule segmentation\n using ultra-high resolution multimodal MRI","summary":" This paper introduces a novel multimodal and high-resolution human brain\ncerebellum lobule segmentation method. Unlike current tools that operate at\nstandard resolution ($1 \\text{ mm}^{3}$) or using mono-modal data, the proposed\nmethod improves cerebellum lobule segmentation through the use of a multimodal\nand ultra-high resolution ($0.125 \\text{ mm}^{3}$) training dataset. To develop\nthe method, first, a database of semi-automatically labelled cerebellum lobules\nwas created to train the proposed method with ultra-high resolution T1 and T2\nMR images. Then, an ensemble of deep networks has been designed and developed,\nallowing the proposed method to excel in the complex cerebellum lobule\nsegmentation task, improving precision while being memory efficient. Notably,\nour approach deviates from the traditional U-Net model by exploring alternative\narchitectures. We have also integrated deep learning with classical machine\nlearning methods incorporating a priori knowledge from multi-atlas\nsegmentation, which improved precision and robustness. Finally, a new online\npipeline, named DeepCERES, has been developed to make available the proposed\nmethod to the scientific community requiring as input only a single T1 MR image\nat standard resolution.\n","authors":["Sergio Morell-Ortega","Marina Ruiz-Perez","Marien Gadea","Roberto Vivo-Hernando","Gregorio Rubio","Fernando Aparici","Mariam de la Iglesia-Vaya","Gwenaelle Catheline","Pierrick Coupé","José V. Manjón"],"pdf_url":"https://arxiv.org/pdf/2401.12074v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.12051v1","updated":"2024-01-22T15:42:21Z","published":"2024-01-22T15:42:21Z","title":"CloSe: A 3D Clothing Segmentation Dataset and Model","summary":" 3D Clothing modeling and datasets play crucial role in the entertainment,\nanimation, and digital fashion industries. Existing work often lacks detailed\nsemantic understanding or uses synthetic datasets, lacking realism and\npersonalization. To address this, we first introduce CloSe-D: a novel\nlarge-scale dataset containing 3D clothing segmentation of 3167 scans, covering\na range of 18 distinct clothing classes. Additionally, we propose CloSe-Net,\nthe first learning-based 3D clothing segmentation model for fine-grained\nsegmentation from colored point clouds. 
CloSe-Net uses local point features,\nbody-clothing correlation, and a garment-class and point features-based\nattention module, improving performance over baselines and prior work. The\nproposed attention module enables our model to learn appearance and\ngeometry-dependent clothing prior from data. We further validate the efficacy\nof our approach by successfully segmenting publicly available datasets of\npeople in clothing. We also introduce CloSe-T, a 3D interactive tool for\nrefining segmentation labels. Combining the tool with CloSe-T in a continual\nlearning setup demonstrates improved generalization on real-world data.\nDataset, model, and tool can be found at\nhttps://virtualhumans.mpi-inf.mpg.de/close3dv24/.\n","authors":["Dimitrije Antić","Garvita Tiwari","Batuhan Ozcomlekci","Riccardo Marin","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2401.12051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12048v1","updated":"2024-01-22T15:40:24Z","published":"2024-01-22T15:40:24Z","title":"HomeRobot Open Vocabulary Mobile Manipulation Challenge 2023 Participant\n Report (Team KuzHum)","summary":" We report an improvements to NeurIPS 2023 HomeRobot: Open Vocabulary Mobile\nManipulation (OVMM) Challenge reinforcement learning baseline. More\nspecifically, we propose more accurate semantic segmentation module, along with\nbetter place skill policy, and high-level heuristic that outperforms the\nbaseline by 2.4% of overall success rate (sevenfold improvement) and 8.2% of\npartial success rate (1.75 times improvement) on Test Standard split of the\nchallenge dataset. With aforementioned enhancements incorporated our agent\nscored 3rd place in the challenge on both simulation and real-world stages.\n","authors":["Volodymyr Kuzma","Vladyslav Humennyy","Ruslan Partsey"],"pdf_url":"https://arxiv.org/pdf/2401.12048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08865v2","updated":"2024-01-22T15:30:08Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. 
Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v2.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2401.12039v1","updated":"2024-01-22T15:26:01Z","published":"2024-01-22T15:26:01Z","title":"Look, Listen and Recognise: Character-Aware Audio-Visual Subtitling","summary":" The goal of this paper is automatic character-aware subtitle generation.\nGiven a video and a minimal amount of metadata, we propose an audio-visual\nmethod that generates a full transcript of the dialogue, with precise speech\ntimestamps, and the character speaking identified. The key idea is to first use\naudio-visual cues to select a set of high-precision audio exemplars for each\ncharacter, and then use these exemplars to classify all speech segments by\nspeaker identity. Notably, the method does not require face detection or\ntracking. We evaluate the method over a variety of TV sitcoms, including\nSeinfeld, Fraiser and Scrubs. We envision this system being useful for the\nautomatic generation of subtitles to improve the accessibility of the vast\namount of videos available on modern streaming services. Project page :\n\\url{https://www.robots.ox.ac.uk/~vgg/research/look-listen-recognise/}\n","authors":["Bruno Korbar","Jaesung Huh","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2401.12039v1.pdf","comment":"Accepted for publication in ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12033v1","updated":"2024-01-22T15:19:18Z","published":"2024-01-22T15:19:18Z","title":"Momentum-SAM: Sharpness Aware Minimization without Computational\n Overhead","summary":" The recently proposed optimization algorithm for deep neural networks\nSharpness Aware Minimization (SAM) suggests perturbing parameters before\ngradient calculation by a gradient ascent step to guide the optimization into\nparameter space regions of flat loss. While significant generalization\nimprovements and thus reduction of overfitting could be demonstrated, the\ncomputational costs are doubled due to the additionally needed gradient\ncalculation, making SAM unfeasible in case of limited computationally\ncapacities. Motivated by Nesterov Accelerated Gradient (NAG) we propose\nMomentum-SAM (MSAM), which perturbs parameters in the direction of the\naccumulated momentum vector to achieve low sharpness without significant\ncomputational overhead or memory demands over SGD or Adam. We evaluate MSAM in\ndetail and reveal insights on separable mechanisms of NAG, SAM and MSAM\nregarding training optimization and generalization. 
Code is available at\nhttps://github.com/MarlonBecker/MSAM.\n","authors":["Marlon Becker","Frederick Altrock","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2401.12033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09495v3","updated":"2024-01-22T15:05:43Z","published":"2024-01-17T01:33:40Z","title":"IPR-NeRF: Ownership Verification meets Neural Radiance Field","summary":" Neural Radiance Field (NeRF) models have gained significant attention in the\ncomputer vision community in the recent past with state-of-the-art visual\nquality and produced impressive demonstrations. Since then, technopreneurs have\nsought to leverage NeRF models into a profitable business. Therefore, NeRF\nmodels make it worth the risk of plagiarizers illegally copying,\nre-distributing, or misusing those models. This paper proposes a comprehensive\nintellectual property (IP) protection framework for the NeRF model in both\nblack-box and white-box settings, namely IPR-NeRF. In the black-box setting, a\ndiffusion-based solution is introduced to embed and extract the watermark via a\ntwo-stage optimization process. In the white-box setting, a designated digital\nsignature is embedded into the weights of the NeRF model by adopting the sign\nloss objective. Our extensive experiments demonstrate that not only does our\napproach maintain the fidelity (\\ie, the rendering quality) of IPR-NeRF models,\nbut it is also robust against both ambiguity and removal attacks compared to\nprior arts.\n","authors":["Win Kent Ong","Kam Woh Ng","Chee Seng Chan","Yi Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2401.09495v3.pdf","comment":"Error on result tabulation for the state of the art method which\n might cause misleading to the readers"},{"id":"http://arxiv.org/abs/2401.12019v1","updated":"2024-01-22T15:05:05Z","published":"2024-01-22T15:05:05Z","title":"Stereo-Matching Knowledge Distilled Monocular Depth Estimation Filtered\n by Multiple Disparity Consistency","summary":" In stereo-matching knowledge distillation methods of the self-supervised\nmonocular depth estimation, the stereo-matching network's knowledge is\ndistilled into a monocular depth network through pseudo-depth maps. In these\nmethods, the learning-based stereo-confidence network is generally utilized to\nidentify errors in the pseudo-depth maps to prevent transferring the errors.\nHowever, the learning-based stereo-confidence networks should be trained with\nground truth (GT), which is not feasible in a self-supervised setting. In this\npaper, we propose a method to identify and filter errors in the pseudo-depth\nmap using multiple disparity maps by checking their consistency without the\nneed for GT and a training process. Experimental results show that the proposed\nmethod outperforms the previous methods and works well on various\nconfigurations by filtering out erroneous areas where the stereo-matching is\nvulnerable, especially such as textureless regions, occlusion boundaries, and\nreflective surfaces.\n","authors":["Woonghyun Ka","Jae Young Lee","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12019v1.pdf","comment":"ICASSP 2024. The first two authors are equally contributed"},{"id":"http://arxiv.org/abs/2401.12014v1","updated":"2024-01-22T15:00:32Z","published":"2024-01-22T15:00:32Z","title":"Robustness to distribution shifts of compressed networks for edge\n devices","summary":" It is necessary to develop efficient DNNs deployed on edge devices with\nlimited computation resources. 
However, the compressed networks often execute\nnew tasks in the target domain, which is different from the source domain where\nthe original network is trained. It is important to investigate the robustness\nof compressed networks in two types of data distribution shifts: domain shifts\nand adversarial perturbations. In this study, we discover that compressed\nmodels are less robust to distribution shifts than their original networks.\nInterestingly, larger networks are more vulnerable to losing robustness than\nsmaller ones, even when they are compressed to a similar size as the smaller\nnetworks. Furthermore, compact networks obtained by knowledge distillation are\nmuch more robust to distribution shifts than pruned networks. Finally,\npost-training quantization is a reliable method for achieving significant\nrobustness to distribution shifts, and it outperforms both pruned and distilled\nmodels in terms of robustness.\n","authors":["Lulan Shen","Ali Edalati","Brett Meyer","Warren Gross","James J. Clark"],"pdf_url":"https://arxiv.org/pdf/2401.12014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11841v4","updated":"2024-01-22T14:59:20Z","published":"2023-12-19T04:14:11Z","title":"MixRT: Mixed Neural Representations For Real-Time NeRF Rendering","summary":" Neural Radiance Field (NeRF) has emerged as a leading technique for novel\nview synthesis, owing to its impressive photorealistic reconstruction and\nrendering capability. Nevertheless, achieving real-time NeRF rendering in\nlarge-scale scenes has presented challenges, often leading to the adoption of\neither intricate baked mesh representations with a substantial number of\ntriangles or resource-intensive ray marching in baked representations. We\nchallenge these conventions, observing that high-quality geometry, represented\nby meshes with substantial triangles, is not necessary for achieving\nphotorealistic rendering quality. Consequently, we propose MixRT, a novel NeRF\nrepresentation that includes a low-quality mesh, a view-dependent displacement\nmap, and a compressed NeRF model. This design effectively harnesses the\ncapabilities of existing graphics hardware, thus enabling real-time NeRF\nrendering on edge devices. Leveraging a highly-optimized WebGL-based rendering\nframework, our proposed MixRT attains real-time rendering speeds on edge\ndevices (over 30 FPS at a resolution of 1280 x 720 on a MacBook M1 Pro laptop),\nbetter rendering quality (0.2 PSNR higher in indoor scenes of the Unbounded-360\ndatasets), and a smaller storage size (less than 80% compared to\nstate-of-the-art methods).\n","authors":["Chaojian Li","Bichen Wu","Peter Vajda"," Yingyan"," Lin"],"pdf_url":"https://arxiv.org/pdf/2312.11841v4.pdf","comment":"Accepted by 3DV'24. Project Page: https://licj15.github.io/MixRT/"},{"id":"http://arxiv.org/abs/2312.10105v2","updated":"2024-01-22T14:56:52Z","published":"2023-12-15T04:11:34Z","title":"Forging Tokens for Improved Storage-efficient Training","summary":" Recent advancements in Deep Neural Network (DNN) models have significantly\nimproved performance across computer vision tasks. However, achieving highly\ngeneralizable and high-performing vision models requires extensive datasets,\nleading to large storage requirements. This storage challenge poses a critical\nbottleneck for scaling up vision models. Motivated by the success of discrete\nrepresentations, SeiT proposes to use Vector-Quantized (VQ) feature vectors\n(i.e., tokens) as network inputs for vision classification. 
However, applying\ntraditional data augmentations to tokens faces challenges due to input domain\nshift. To address this issue, we introduce TokenAdapt and ColorAdapt, simple\nyet effective token-based augmentation strategies. TokenAdapt realigns token\nembedding space for compatibility with spatial augmentations, preserving the\nmodel's efficiency without requiring fine-tuning. Additionally, ColorAdapt\naddresses color-based augmentations for tokens inspired by Adaptive Instance\nNormalization (AdaIN). We evaluate our approach across various scenarios,\nincluding storage-efficient ImageNet-1k classification, fine-grained\nclassification, robustness benchmarks, and ADE-20k semantic segmentation.\nExperimental results demonstrate consistent performance improvement in diverse\nexperiments. Code is available at https://github.com/naver-ai/tokenadapt.\n","authors":["Minhyun Lee","Song Park","Byeongho Heo","Dongyoon Han","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.10105v2.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2311.03782v3","updated":"2024-01-22T14:52:14Z","published":"2023-11-07T08:05:09Z","title":"CapST: An Enhanced and Lightweight Model Attribution Approach for\n Synthetic Videos","summary":" Deepfake videos, generated through AI faceswapping techniques, have garnered\nconsiderable attention due to their potential for powerful impersonation\nattacks. While existing research primarily focuses on binary classification to\ndiscern between real and fake videos, determining the specific\ngeneration model for a fake video is crucial for forensic investigation.\nAddressing this gap, this paper investigates the model attribution problem of\nDeepfake videos from a recently proposed dataset, Deepfakes from Different\nModels (DFDM), derived from various Autoencoder models. The dataset comprises\n6,450 Deepfake videos generated by five distinct models with variations in\nencoder, decoder, intermediate layer, input resolution, and compression ratio.\nThis study formulates Deepfakes model attribution as a multiclass\nclassification task, proposing a segment of VGG19 as a feature extraction\nbackbone, known for its effectiveness in image-related tasks, while integrating a\nCapsule Network with a Spatio-Temporal attention mechanism. The Capsule module\ncaptures intricate hierarchies among features for robust identification of\ndeepfake attributes. Additionally, the video-level fusion technique leverages\ntemporal attention mechanisms to handle concatenated feature vectors,\ncapitalizing on inherent temporal dependencies in deepfake videos.
By\naggregating insights across frames, our model gains a comprehensive\nunderstanding of video content, resulting in more precise predictions.\nExperimental results on the deepfake benchmark dataset (DFDM) demonstrate the\nefficacy of our proposed method, achieving up to a 4% improvement in accurately\ncategorizing deepfake videos compared to baseline models while demanding fewer\ncomputational resources.\n","authors":["Wasim Ahmad","Yan-Tsung Peng","Yuan-Hao Chang","Gaddisa Olani Ganfure","Sarwar Khan","Sahibzada Adil Shahzad"],"pdf_url":"https://arxiv.org/pdf/2311.03782v3.pdf","comment":"Rejected from jounal and will have to conduct several more\n experiments"},{"id":"http://arxiv.org/abs/2401.12001v1","updated":"2024-01-22T14:52:08Z","published":"2024-01-22T14:52:08Z","title":"Modeling Stereo-Confidence Out of the End-to-End Stereo-Matching Network\n via Disparity Plane Sweep","summary":" We propose a novel stereo-confidence that can be measured externally to\nvarious stereo-matching networks, offering an alternative input modality choice\nof the cost volume for learning-based approaches, especially in safety-critical\nsystems. Grounded in the foundational concepts of disparity definition and the\ndisparity plane sweep, the proposed stereo-confidence method is built upon the\nidea that any shift in a stereo-image pair should be updated in a corresponding\namount shift in the disparity map. Based on this idea, the proposed\nstereo-confidence method can be summarized in three folds. 1) Using the\ndisparity plane sweep, multiple disparity maps can be obtained and treated as a\n3-D volume (predicted disparity volume), like the cost volume is constructed.\n2) One of these disparity maps serves as an anchor, allowing us to define a\ndesirable (or ideal) disparity profile at every spatial point. 3) By comparing\nthe desirable and predicted disparity profiles, we can quantify the level of\nmatching ambiguity between left and right images for confidence measurement.\nExtensive experimental results using various stereo-matching networks and\ndatasets demonstrate that the proposed stereo-confidence method not only shows\ncompetitive performance on its own but also consistent performance improvements\nwhen it is used as an input modality for learning-based stereo-confidence\nmethods.\n","authors":["Jae Young Lee","Woonghyun Ka","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12001v1.pdf","comment":"AAAI 2024. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2401.11985v1","updated":"2024-01-22T14:38:25Z","published":"2024-01-22T14:38:25Z","title":"Scaling Face Interaction Graph Networks to Real World Scenes","summary":" Accurately simulating real world object dynamics is essential for various\napplications such as robotics, engineering, graphics, and design. To better\ncapture complex real dynamics such as contact and friction, learned simulators\nbased on graph networks have recently shown great promise. However, applying\nthese learned simulators to real scenes comes with two major challenges: first,\nscaling learned simulators to handle the complexity of real world scenes which\ncan involve hundreds of objects each with complicated 3D shapes, and second,\nhandling inputs from perception rather than 3D state information. Here we\nintroduce a method which substantially reduces the memory required to run\ngraph-based learned simulators. 
Based on this memory-efficient simulation\nmodel, we then present a perceptual interface in the form of editable NeRFs\nwhich can convert real-world scenes into a structured representation that can\nbe processed by graph network simulator. We show that our method uses\nsubstantially less memory than previous graph-based simulators while retaining\ntheir accuracy, and that the simulators learned in synthetic environments can\nbe applied to real world scenes captured from multiple camera angles. This\npaves the way for expanding the application of learned simulators to settings\nwhere only perceptual information is available at inference time.\n","authors":["Tatiana Lopez-Guevara","Yulia Rubanova","William F. Whitney","Tobias Pfaff","Kimberly Stachenfeld","Kelsey R. Allen"],"pdf_url":"https://arxiv.org/pdf/2401.11985v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.11960v1","updated":"2024-01-22T14:02:56Z","published":"2024-01-22T14:02:56Z","title":"Observation-Guided Meteorological Field Downscaling at Station Scale: A\n Benchmark and a New Method","summary":" Downscaling (DS) of meteorological variables involves obtaining\nhigh-resolution states from low-resolution meteorological fields and is an\nimportant task in weather forecasting. Previous methods based on deep learning\ntreat downscaling as a super-resolution task in computer vision and utilize\nhigh-resolution gridded meteorological fields as supervision to improve\nresolution at specific grid scales. However, this approach has struggled to\nalign with the continuous distribution characteristics of meteorological\nfields, leading to an inherent systematic bias between the downscaled results\nand the actual observations at meteorological stations. In this paper, we\nextend meteorological downscaling to arbitrary scattered station scales,\nestablish a brand new benchmark and dataset, and retrieve meteorological states\nat any given station location from a coarse-resolution meteorological field.\nInspired by data assimilation techniques, we integrate observational data into\nthe downscaling process, providing multi-scale observational priors. Building\non this foundation, we propose a new downscaling model based on hypernetwork\narchitecture, namely HyperDS, which efficiently integrates different\nobservational information into the model training, achieving continuous scale\nmodeling of the meteorological field. Through extensive experiments, our\nproposed method outperforms other specially designed baseline models on\nmultiple surface variables. Notably, the mean squared error (MSE) for wind\nspeed and surface pressure improved by 67% and 19.5% compared to other methods.\nWe will release the dataset and code subsequently.\n","authors":["Zili Liu","Hao Chen","Lei Bai","Wenyuan Li","Keyan Chen","Zhengyi Wang","Wanli Ouyang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2401.11960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11949v1","updated":"2024-01-22T13:38:24Z","published":"2024-01-22T13:38:24Z","title":"Feature Denoising Diffusion Model for Blind Image Quality Assessment","summary":" Blind Image Quality Assessment (BIQA) aims to evaluate image quality in line\nwith human perception, without reference benchmarks. Currently, deep learning\nBIQA methods typically depend on using features from high-level tasks for\ntransfer learning. However, the inherent differences between BIQA and these\nhigh-level tasks inevitably introduce noise into the quality-aware features. 
In\nthis paper, we take an initial step towards exploring the diffusion model for\nfeature denoising in BIQA, namely Perceptual Feature Diffusion for IQA\n(PFD-IQA), which aims to remove noise from quality-aware features.\nSpecifically, (i) We propose a Perceptual Prior Discovery and Aggregation\nmodule to establish two auxiliary tasks to discover potential low-level\nfeatures in images that are used to aggregate perceptual text conditions for\nthe diffusion model. (ii) We propose a Perceptual Prior-based Feature\nRefinement strategy, which matches noisy features to predefined denoising\ntrajectories and then performs exact feature denoising based on text\nconditions. Extensive experiments on eight standard BIQA datasets demonstrate\nthe superior performance to the state-of-the-art BIQA methods, i.e., achieving\nthe PLCC values of 0.935 (vs. 0.905 in KADID) and 0.922 (vs. 0.894 in LIVEC).\n","authors":["Xudong Li","Jingyuan Zheng","Runze Hu","Yan Zhang","Ke Li","Yunhang Shen","Xiawu Zheng","Yutao Liu","ShengChuan Zhang","Pingyang Dai","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2401.11949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11944v1","updated":"2024-01-22T13:34:34Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU.\n CMMMU includes 12k manually collected multimodal questions from college\nexams, quizzes, and textbooks, covering six core disciplines: Art & Design,\nBusiness, Science, Health & Medicine, Humanities & Social Science, and Tech &\nEngineering, like its companion, MMMU. These questions span 30 subjects and\ncomprise 39 highly heterogeneous image types, such as charts, diagrams, maps,\ntables, music sheets, and chemical structures.\n CMMMU focuses on complex perception and reasoning with domain-specific\nknowledge in the Chinese context. We evaluate 11 open-source LLMs and one\nproprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%,\nindicating a large space for improvement.
CMMMU will boost the community to\nbuild the next-generation LMMs towards expert artificial intelligence and\npromote the democratization of LMMs by providing diverse language contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Wenhu Chen","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2303.07064v3","updated":"2024-01-22T13:26:32Z","published":"2023-03-13T12:38:07Z","title":"A Generalized Multi-Modal Fusion Detection Framework","summary":" LiDAR point clouds have become the most common data source in autonomous\ndriving. However, due to the sparsity of point clouds, accurate and reliable\ndetection cannot be achieved in specific scenarios. Because of their\ncomplementarity with point clouds, images are getting increasing attention.\nAlthough with some success, existing fusion methods either perform hard fusion\nor do not fuse in a direct manner. In this paper, we propose a generic 3D\ndetection framework called MMFusion, using multi-modal features. The framework\naims to achieve accurate fusion between LiDAR and images to improve 3D\ndetection in complex scenes. Our framework consists of two separate streams:\nthe LiDAR stream and the camera stream, which can be compatible with any\nsingle-modal feature extraction network. The Voxel Local Perception Module in\nthe LiDAR stream enhances local feature representation, and then the\nMulti-modal Feature Fusion Module selectively combines feature output from\ndifferent streams to achieve better fusion. Extensive experiments have shown\nthat our framework not only outperforms existing benchmarks but also improves\ntheir detection, especially for detecting cyclists and pedestrians on KITTI\nbenchmarks, with strong robustness and generalization capabilities. 
Hopefully,\nour work will stimulate more research into multi-modal fusion for autonomous\ndriving tasks.\n","authors":["Leichao Cui","Xiuxian Li","Min Meng","Xiaoyu Mo"],"pdf_url":"https://arxiv.org/pdf/2303.07064v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15567v3","updated":"2024-01-22T13:17:21Z","published":"2023-07-28T14:04:06Z","title":"Panoptic Scene Graph Generation with Semantics-Prototype Learning","summary":" Panoptic Scene Graph Generation (PSG) parses objects and predicts their\nrelationships (predicate) to connect human language and visual scenes. However,\ndifferent language preferences of annotators and semantic overlaps between\npredicates lead to biased predicate annotations in the dataset, i.e. different\npredicates for same object pairs. Biased predicate annotations make PSG models\nstruggle in constructing a clear decision plane among predicates, which greatly\nhinders the real application of PSG models. To address the intrinsic bias\nabove, we propose a novel framework named ADTrans to adaptively transfer biased\npredicate annotations to informative and unified ones. To promise consistency\nand accuracy during the transfer process, we propose to measure the invariance\nof representations in each predicate class, and learn unbiased prototypes of\npredicates with different intensities. Meanwhile, we continuously measure the\ndistribution changes between each presentation and its prototype, and\nconstantly screen potential biased data. Finally, with the unbiased\npredicate-prototype representation embedding space, biased annotations are\neasily identified. Experiments show that ADTrans significantly improves the\nperformance of benchmark models, achieving a new state-of-the-art performance,\nand shows great generalization and effectiveness on multiple datasets.\n","authors":["Li Li","Wei Ji","Yiming Wu","Mengze Li","You Qin","Lina Wei","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2307.15567v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2310.09126v2","updated":"2024-01-22T13:14:33Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Practical Low-light Raw Image\n Denoising","summary":" Recently, the mainstream practice for training low-light raw image denoising\nmethods has shifted towards employing synthetic data. Noise modeling, which\nfocuses on characterizing the noise distribution of real-world sensors,\nprofoundly influences the effectiveness and practicality of synthetic data.\nCurrently, physics-based noise modeling struggles to characterize the entire\nreal noise distribution, while learning-based noise modeling impractically\ndepends on paired real data. In this paper, we propose a novel strategy:\nlearning the noise model from dark frames instead of paired real data, to break\ndown the data dependency. Based on this strategy, we introduce an efficient\nphysics-guided noise neural proxy (PNNP) to approximate the real-world sensor\nnoise model. Specifically, we integrate physical priors into neural proxies and\nintroduce three efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution loss (DDL).\nPND decouples the dark frame into different components and handles different\nlevels of noise flexibly, which reduces the complexity of noise modeling. PPM\nincorporates physical priors to constrain the generated noise, which promotes\nthe accuracy of noise modeling. 
DDL provides explicit and reliable supervision\nfor noise distribution, which promotes the precision of noise modeling. PNNP\nexhibits powerful potential in characterizing the real noise distribution.\nExtensive experiments on public datasets demonstrate superior performance in\npractical low-light raw image denoising. The code will be available at\n\\url{https://github.com/fenghansen/PNNP}.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Lin Zhu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11914v1","updated":"2024-01-22T13:01:35Z","published":"2024-01-22T13:01:35Z","title":"A Saliency Enhanced Feature Fusion based multiscale RGB-D Salient Object\n Detection Network","summary":" Multiscale convolutional neural network (CNN) has demonstrated remarkable\ncapabilities in solving various vision problems. However, fusing features of\ndifferent scales always results in large model sizes, impeding the application\nof multiscale CNNs in RGB-D saliency detection. In this paper, we propose a\ncustomized feature fusion module, called Saliency Enhanced Feature Fusion\n(SEFF), for RGB-D saliency detection. SEFF utilizes saliency maps of the\nneighboring scales to enhance the necessary features for fusing, resulting in\nmore representative fused features. Our multiscale RGB-D saliency detector uses\nSEFF and processes images with three different scales. SEFF is used to fuse the\nfeatures of RGB and depth images, as well as the features of decoders at\ndifferent scales. Extensive experiments on five benchmark datasets have\ndemonstrated the superiority of our method over ten SOTA saliency detectors.\n","authors":["Rui Huang","Qingyi Zhao","Yan Xing","Sihua Gao","Weifeng Xu","Yuxiang Zhang","Wei Fan"],"pdf_url":"https://arxiv.org/pdf/2401.11914v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.11913v1","updated":"2024-01-22T13:01:28Z","published":"2024-01-22T13:01:28Z","title":"Large receptive field strategy and important feature extraction strategy\n in 3D object detection","summary":" The enhancement of 3D object detection is pivotal for precise environmental\nperception and improved task execution capabilities in autonomous driving.\nLiDAR point clouds, offering accurate depth information, serve as crucial\ninformation for this purpose. Our study focuses on key challenges in 3D target\ndetection. To tackle the challenge of expanding the receptive field of a 3D\nconvolutional kernel, we introduce the Dynamic Feature Fusion Module (DFFM).\nThis module achieves adaptive expansion of the 3D convolutional kernel's\nreceptive field, balancing the expansion with acceptable computational loads.\nThis innovation reduces operations, expands the receptive field, and allows the\nmodel to dynamically adjust to different object requirements. Simultaneously,\nwe identify redundant information in 3D features. Employing the Feature\nSelection Module (FSM) quantitatively evaluates and eliminates non-important\nfeatures, achieving the separation of output box fitting and feature\nextraction. This innovation enables the detector to focus on critical features,\nresulting in model compression, reduced computational burden, and minimized\ncandidate frame interference.
Extensive experiments confirm that both DFFM and\nFSM not only enhance current benchmarks, particularly in small target\ndetection, but also accelerate network performance. Importantly, these modules\nexhibit effective complementarity.\n","authors":["Leichao Cui","Xiuxian Li","Min Meng"],"pdf_url":"https://arxiv.org/pdf/2401.11913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11902v1","updated":"2024-01-22T12:50:21Z","published":"2024-01-22T12:50:21Z","title":"A Training-Free Defense Framework for Robust Learned Image Compression","summary":" We study the robustness of learned image compression models against\nadversarial attacks and present a training-free defense technique based on\nsimple image transform functions. Recent learned image compression models are\nvulnerable to adversarial attacks that result in poor compression rate, low\nreconstruction quality, or weird artifacts. To address the limitations, we\npropose a simple but effective two-way compression algorithm with random input\ntransforms, which is conveniently applicable to existing image compression\nmodels. Unlike the na\\\"ive approaches, our approach preserves the original\nrate-distortion performance of the models on clean images. Moreover, the\nproposed algorithm requires no additional training or modification of existing\nmodels, making it more practical. We demonstrate the effectiveness of the\nproposed techniques through extensive experiments under multiple compression\nmodels, evaluation metrics, and attack scenarios.\n","authors":["Myungseo Song","Jinyoung Choi","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2401.11902v1.pdf","comment":"10 pages and 14 figures"},{"id":"http://arxiv.org/abs/2203.13718v2","updated":"2024-01-22T12:47:52Z","published":"2022-03-25T15:40:44Z","title":"Digital Fingerprinting of Microstructures","summary":" Finding efficient means of fingerprinting microstructural information is a\ncritical step towards harnessing data-centric machine learning approaches. A\nstatistical framework is systematically developed for compressed\ncharacterisation of a population of images, which includes some classical\ncomputer vision methods as special cases. The focus is on materials\nmicrostructure. The ultimate purpose is to rapidly fingerprint sample images in\nthe context of various high-throughput design/make/test scenarios. This\nincludes, but is not limited to, quantification of the disparity between\nmicrostructures for quality control, classifying microstructures, predicting\nmaterials properties from image data and identifying potential processing\nroutes to engineer new materials with specific properties. Here, we consider\nmicrostructure classification and utilise the resulting features over a range\nof related machine learning tasks, namely supervised, semi-supervised, and\nunsupervised learning.\n The approach is applied to two distinct datasets to illustrate various\naspects and some recommendations are made based on the findings. In particular,\nmethods that leverage transfer learning with convolutional neural networks\n(CNNs), pretrained on the ImageNet dataset, are generally shown to outperform\nother methods. Additionally, dimensionality reduction of these CNN-based\nfingerprints is shown to have negligible impact on classification accuracy for\nthe supervised learning approaches considered. 
In situations where there is a\nlarge dataset with only a handful of images labelled, graph-based label\npropagation to unlabelled data is shown to be favourable over discarding\nunlabelled data and performing supervised learning. In particular, label\npropagation by Poisson learning is shown to be highly effective at low label\nrates.\n","authors":["Michael D. White","Alexander Tarakanov","Christopher P. Race","Philip J. Withers","Kody J. H. Law"],"pdf_url":"https://arxiv.org/pdf/2203.13718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.00067v3","updated":"2024-01-22T12:24:36Z","published":"2022-06-30T19:13:23Z","title":"Rethinking Unsupervised Domain Adaptation for Semantic Segmentation","summary":" Unsupervised domain adaptation (UDA) adapts a model trained on one domain\n(called source) to a novel domain (called target) using only unlabeled data.\nDue to its high annotation cost, researchers have developed many UDA methods\nfor semantic segmentation, which assume no labeled sample is available in the\ntarget domain. We question the practicality of this assumption for two reasons.\nFirst, after training a model with a UDA method, we must somehow verify the\nmodel before deployment. Second, UDA methods have at least a few\nhyper-parameters that need to be determined. The surest solution to these is to\nevaluate the model using validation data, i.e., a certain amount of labeled\ntarget-domain samples. This question about the basic assumption of UDA leads us\nto rethink UDA from a data-centric point of view. Specifically, we assume we\nhave access to a minimum level of labeled data. Then, we ask how much is\nnecessary to find good hyper-parameters of existing UDA methods. We then\nconsider what if we use the same data for supervised training of the same\nmodel, e.g., finetuning. We conducted experiments to answer these questions\nwith popular scenarios, {GTA5, SYNTHIA}$\\rightarrow$Cityscapes. We found that\ni) choosing good hyper-parameters needs only a few labeled images for some UDA\nmethods whereas a lot more for others; and ii) simple finetuning works\nsurprisingly well; it outperforms many UDA methods if only several dozens of\nlabeled images are available.\n","authors":["Zhijie Wang","Masanori Suganuma","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2207.00067v3.pdf","comment":"Under review in Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2401.11877v1","updated":"2024-01-22T12:02:40Z","published":"2024-01-22T12:02:40Z","title":"Evaluating the Feasibility of Standard Facial Expression Recognition in\n Individuals with Moderate to Severe Intellectual Disabilities","summary":" Recent research has underscored the increasing preference of users for\nhuman-like interactions with machines. Consequently, facial expression\nrecognition has gained significance as a means of imparting social robots with\nthe capacity to discern the emotional states of users. In this investigation,\nwe assess the suitability of deep learning approaches, known for their\nremarkable performance in this domain, for recognizing facial expressions in\nindividuals with intellectual disabilities, which has not been yet studied in\nthe literature, to the best of our knowledge. To address this objective, we\ntrain a set of twelve distinct convolutional neural networks in different\napproaches, including an ensemble of datasets without individuals with\nintellectual disabilities and a dataset featuring such individuals. 
Our\nexamination of the outcomes achieved by the various models under distinct\ntraining conditions, coupled with a comprehensive analysis of critical facial\nregions during expression recognition facilitated by explainable artificial\nintelligence techniques, revealed significant distinctions in facial\nexpressions between individuals with and without intellectual disabilities, as\nwell as among individuals with intellectual disabilities. Remarkably, our\nfindings demonstrate the feasibility of facial expression recognition within\nthis population through tailored user-specific training methodologies, which\nenable the models to effectively address the unique expressions of each user.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis","Jose M. Buades-Rubio","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11874v1","updated":"2024-01-22T12:00:37Z","published":"2024-01-22T12:00:37Z","title":"Detect-Order-Construct: A Tree Construction based Approach for\n Hierarchical Document Structure Analysis","summary":" Document structure analysis (aka document layout analysis) is crucial for\nunderstanding the physical layout and logical structure of documents, with\napplications in information retrieval, document summarization, knowledge\nextraction, etc. In this paper, we concentrate on Hierarchical Document\nStructure Analysis (HDSA) to explore hierarchical relationships within\nstructured documents created using authoring software employing hierarchical\nschemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze\nhierarchical document structures, we propose a tree construction based approach\nthat addresses multiple subtasks concurrently, including page object detection\n(Detect), reading order prediction of identified objects (Order), and the\nconstruction of intended hierarchical structure (Construct). We present an\neffective end-to-end solution based on this framework to demonstrate its\nperformance. To assess our approach, we develop a comprehensive benchmark\ncalled Comp-HRDoc, which evaluates the above subtasks simultaneously. Our\nend-to-end system achieves state-of-the-art performance on two large-scale\ndocument layout analysis datasets (PubLayNet and DocLayNet), a high-quality\nhierarchical document structure reconstruction dataset (HRDoc), and our\nComp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate\nfurther research in this field.\n","authors":["Jiawei Wang","Kai Hu","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.11874v1.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2401.11859v1","updated":"2024-01-22T11:28:24Z","published":"2024-01-22T11:28:24Z","title":"LKFormer: Large Kernel Transformer for Infrared Image Super-Resolution","summary":" Given the broad application of infrared technology across diverse fields,\nthere is an increasing emphasis on investigating super-resolution techniques\nfor infrared images within the realm of deep learning. Despite the impressive\nresults of current Transformer-based methods in image super-resolution tasks,\ntheir reliance on the self-attentive mechanism intrinsic to the Transformer\narchitecture results in images being treated as one-dimensional sequences,\nthereby neglecting their inherent two-dimensional structure. 
Moreover, infrared\nimages exhibit a uniform pixel distribution and a limited gradient range,\nposing challenges for the model to capture effective feature information.\nConsequently, we suggest a potent Transformer model, termed Large Kernel\nTransformer (LKFormer), to address this issue. Specifically, we have designed a\nLarge Kernel Residual Depth-wise Convolutional Attention (LKRDA) module with\nlinear complexity. This mainly employs depth-wise convolution with large\nkernels to execute non-local feature modeling, thereby substituting the\nstandard self-attentive layer. Additionally, we have devised a novel\nfeed-forward network structure called Gated-Pixel Feed-Forward Network (GPFN)\nto augment the LKFormer's capacity to manage the information flow within the\nnetwork. Comprehensive experimental results reveal that our method surpasses\nthe most advanced techniques available, using fewer parameters and yielding\nconsiderably superior performance.\n","authors":["Feiwei Qin","Kang Yan","Changmiao Wang","Ruiquan Ge","Yong Peng","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11856v1","updated":"2024-01-22T11:25:59Z","published":"2024-01-22T11:25:59Z","title":"MOSformer: Momentum encoder-based inter-slice fusion transformer for\n medical image segmentation","summary":" Medical image segmentation takes an important position in various clinical\napplications. Deep learning has emerged as the predominant solution for\nautomated segmentation of volumetric medical images. 2.5D-based segmentation\nmodels bridge computational efficiency of 2D-based models and spatial\nperception capabilities of 3D-based models. However, prevailing 2.5D-based\nmodels often treat each slice equally, failing to effectively learn and exploit\ninter-slice information, resulting in suboptimal segmentation performances. In\nthis paper, a novel Momentum encoder-based inter-slice fusion transformer\n(MOSformer) is proposed to overcome this issue by leveraging inter-slice\ninformation at multi-scale feature maps extracted by different encoders.\nSpecifically, dual encoders are employed to enhance feature distinguishability\namong different slices. One of the encoders is moving-averaged to maintain the\nconsistency of slice representations. Moreover, an IF-Swin transformer module\nis developed to fuse inter-slice multi-scale features. The MOSformer is\nevaluated on three benchmark datasets (Synapse, ACDC, and AMOS), establishing a\nnew state-of-the-art with 85.63%, 92.19%, and 85.43% of DSC, respectively.\nThese promising results indicate its competitiveness in medical image\nsegmentation. Codes and models of MOSformer will be made publicly available\nupon acceptance.\n","authors":["De-Xing Huang","Xiao-Hu Zhou","Xiao-Liang Xie","Shi-Qi Liu","Zhen-Qiu Feng","Mei-Jiang Gui","Hao Li","Tian-Yu Xiang","Xiu-Ling Liu","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2401.11856v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11847v1","updated":"2024-01-22T11:04:55Z","published":"2024-01-22T11:04:55Z","title":"SignVTCL: Multi-Modal Continuous Sign Language Recognition Enhanced by\n Visual-Textual Contrastive Learning","summary":" Sign language recognition (SLR) plays a vital role in facilitating\ncommunication for the hearing-impaired community. SLR is a weakly supervised\ntask where entire videos are annotated with glosses, making it challenging to\nidentify the corresponding gloss within a video segment. 
Recent studies\nindicate that the main bottleneck in SLR is the insufficient training caused by\nthe limited availability of large-scale datasets. To address this challenge, we\npresent SignVTCL, a multi-modal continuous sign language recognition framework\nenhanced by visual-textual contrastive learning, which leverages the full\npotential of multi-modal data and the generalization ability of language model.\nSignVTCL integrates multi-modal data (video, keypoints, and optical flow)\nsimultaneously to train a unified visual backbone, thereby yielding more robust\nvisual representations. Furthermore, SignVTCL contains a visual-textual\nalignment approach incorporating gloss-level and sentence-level alignment to\nensure precise correspondence between visual features and glosses at the level\nof individual glosses and sentence. Experimental results conducted on three\ndatasets, Phoenix-2014, Phoenix-2014T, and CSL-Daily, demonstrate that SignVTCL\nachieves state-of-the-art results compared with previous methods.\n","authors":["Hao Chen","Jiaze Wang","Ziyu Guo","Jinpeng Li","Donghao Zhou","Bian Wu","Chenyong Guan","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2401.11847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11844v1","updated":"2024-01-22T11:01:52Z","published":"2024-01-22T11:01:52Z","title":"Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field\n Crop Yield Prediction","summary":" Accurate crop yield prediction is of utmost importance for informed\ndecision-making in agriculture, aiding farmers, and industry stakeholders.\nHowever, this task is complex and depends on multiple factors, such as\nenvironmental conditions, soil properties, and management practices. Combining\nheterogeneous data views poses a fusion challenge, like identifying the\nview-specific contribution to the predictive task. We present a novel\nmulti-view learning approach to predict crop yield for different crops\n(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our\nmulti-view input data includes multi-spectral optical images from Sentinel-2\nsatellites and weather data as dynamic features during the crop growing season,\ncomplemented by static features like soil properties and topographic\ninformation. To effectively fuse the data, we introduce a Multi-view Gated\nFusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU)\nmodule. The view-encoders handle the heterogeneity of data sources with varying\ntemporal resolutions by learning a view-specific representation. These\nrepresentations are adaptively fused via a weighted sum. The fusion weights are\ncomputed for each sample by the GU using a concatenation of the\nview-representations. The MVGF model is trained at sub-field level with 10 m\nresolution pixels. Our evaluations show that the MVGF outperforms conventional\nmodels on the same task, achieving the best results by incorporating all the\ndata sources, unlike the usual fusion results in the literature. For Argentina,\nthe MVGF model achieves an R2 value of 0.68 at sub-field yield prediction,\nwhile at field level evaluation (comparing field averages), it reaches around\n0.80 across different countries. 
The GU module learned different weights based\non the country and crop-type, aligning with the variable significance of each\ndata source to the prediction task.\n","authors":["Francisco Mena","Deepak Pathak","Hiba Najjar","Cristhian Sanchez","Patrick Helber","Benjamin Bischke","Peter Habelitz","Miro Miranda","Jayanth Siddamsetty","Marlon Nuske","Marcela Charfuelan","Diego Arenas","Michaela Vollmer","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11835v1","updated":"2024-01-22T10:52:02Z","published":"2024-01-22T10:52:02Z","title":"Unveiling the Human-like Similarities of Automatic Facial Expression\n Recognition: An Empirical Exploration through Explainable AI","summary":" Facial expression recognition is vital for human behavior analysis, and deep\nlearning has enabled models that can outperform humans. However, it is unclear\nhow closely they mimic human processing. This study aims to explore the\nsimilarity between deep neural networks and human perception by comparing\ntwelve different networks, including both general object classifiers and\nFER-specific models. We employ an innovative global explainable AI method to\ngenerate heatmaps, revealing crucial facial regions for the twelve networks\ntrained on six facial expressions. We assess these results both quantitatively\nand qualitatively, comparing them to ground truth masks based on Friesen and\nEkman's description and among them. We use Intersection over Union (IoU) and\nnormalized correlation coefficients for comparisons. We generate 72 heatmaps to\nhighlight critical regions for each expression and architecture. Qualitatively,\nmodels with pre-trained weights show more similarity in heatmaps compared to\nthose without pre-training. Specifically, eye and nose areas influence certain\nfacial expressions, while the mouth is consistently important across all models\nand expressions. Quantitatively, we find low average IoU values (avg. 0.2702)\nacross all expressions and architectures. The best-performing architecture\naverages 0.3269, while the worst-performing one averages 0.2066. Dendrograms,\nbuilt with the normalized correlation coefficient, reveal two main clusters for\nmost expressions: models with pre-training and models without pre-training.\nFindings suggest limited alignment between human and AI facial expression\nrecognition, with network architectures influencing the similarity, as similar\narchitectures prioritize similar facial regions.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis-Guarinos","Cristina Manresa-Yee","Jose M. Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11831v1","updated":"2024-01-22T10:42:51Z","published":"2024-01-22T10:42:51Z","title":"A Fair Evaluation of Various Deep Learning-Based Document Image\n Binarization Approaches","summary":" Binarization of document images is an important pre-processing step in the\nfield of document analysis. Traditional image binarization techniques usually\nrely on histograms or local statistics to identify a valid threshold to\ndifferentiate between different aspects of the image. Deep learning techniques\nare able to generate binarized versions of the images by learning\ncontext-dependent features that are less error-prone to degradation typically\noccurring in document images. In recent years, many deep learning-based methods\nhave been developed for document binarization. But which one to choose? 
There\nhave been no studies that compare these methods rigorously. Therefore, this\nwork focuses on the evaluation of different deep learning-based methods under\nthe same evaluation protocol. We evaluate them on different Document Image\nBinarization Contest (DIBCO) datasets and obtain very heterogeneous results. We\nshow that the DE-GAN model was able to perform better compared to other models\nwhen evaluated on the DIBCO2013 dataset while DP-LinkNet performed best on the\nDIBCO2017 dataset. The 2-StageGAN performed best on the DIBCO2018 dataset while\nSauvolaNet outperformed the others on the DIBCO2019 challenge. Finally, we make\nthe code, all models and evaluation publicly available\n(https://github.com/RichSu95/Document_Binarization_Collection) to ensure\nreproducibility and simplify future binarization evaluations.\n","authors":["Richin Sukesh","Mathias Seuret","Anguelos Nicolaou","Martin Mayr","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2401.11831v1.pdf","comment":"DAS 2022"},{"id":"http://arxiv.org/abs/2401.11824v1","updated":"2024-01-22T10:37:59Z","published":"2024-01-22T10:37:59Z","title":"Rethinking Centered Kernel Alignment in Knowledge Distillation","summary":" Knowledge distillation has emerged as a highly effective method for bridging\nthe representation discrepancy between large-scale models and lightweight\nmodels. Prevalent approaches involve leveraging appropriate metrics to minimize\nthe divergence or distance between the knowledge extracted from the teacher\nmodel and the knowledge learned by the student model. Centered Kernel Alignment\n(CKA) is widely used to measure representation similarity and has been applied\nin several knowledge distillation methods. However, these methods are complex\nand fail to uncover the essence of CKA, thus not answering the question of how\nto use CKA to achieve simple and effective distillation properly. This paper\nfirst provides a theoretical perspective to illustrate the effectiveness of\nCKA, which decouples CKA to the upper bound of Maximum Mean Discrepancy~(MMD)\nand a constant term. Drawing from this, we propose a novel Relation-Centered\nKernel Alignment~(RCKA) framework, which practically establishes a connection\nbetween CKA and MMD. Furthermore, we dynamically customize the application of\nCKA based on the characteristics of each task, with less computational source\nyet comparable performance than the previous methods. The extensive experiments\non the CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves\nstate-of-the-art performance on almost all teacher-student pairs for image\nclassification and object detection, validating the effectiveness of our\napproaches.\n","authors":["Zikai Zhou","Yunhang Shen","Shitong Shao","Huanran Chen","Linrui Gong","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11814v1","updated":"2024-01-22T10:22:14Z","published":"2024-01-22T10:22:14Z","title":"Symbrain: A large-scale dataset of MRI images for neonatal brain\n symmetry analysis","summary":" This paper presents an annotated dataset of brain MRI images designed to\nadvance the field of brain symmetry study. 
Magnetic resonance imaging (MRI) has\ngained interest in analyzing brain symmetry in neonatal infants, and challenges\nremain due to the vast size differences between fetal and adult brains.\nClassification methods for brain structural MRI use scales and visual cues to\nassess hemisphere symmetry, which can help diagnose neonatal patients by\ncomparing hemispheres and anatomical regions of interest in the brain. Using\nthe Developing Human Connectome Project dataset, this work presents a dataset\ncomprising cerebral images extracted as slices across selected portions of\ninterest for clinical evaluation. All the extracted images are annotated with\nthe brain's midline. From the assumption that a decrease in symmetry is directly related to\npossible clinical pathologies, the dataset can contribute to a more precise\ndiagnosis because it can be used to train deep learning models for\nneonatal cerebral MRI anomaly detection from postnatal infant scans using\ncomputer vision. Such models learn to identify and classify anomalies by\nidentifying potential asymmetrical patterns in medical MRI images. Furthermore,\nthis dataset can contribute to the research and development of methods using\nthe relative symmetry of the two brain hemispheres for crucial diagnosis and\ntreatment planning.\n","authors":["Arnaud Gucciardi","Safouane El Ghazouali","Francesca Venturini","Vida Groznik","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2401.11814v1.pdf","comment":"7 pages, 2 figures, Dataset Paper, Medical AI"},{"id":"http://arxiv.org/abs/2401.02436v2","updated":"2024-01-22T10:08:28Z","published":"2023-11-17T14:40:43Z","title":"Compressed 3D Gaussian Splatting for Accelerated Novel View Synthesis","summary":" Recently, high-fidelity scene reconstruction with an optimized 3D Gaussian\nsplat representation has been introduced for novel view synthesis from sparse\nimage sets. Making such representations suitable for applications like network\nstreaming and rendering on low-power devices requires significantly reduced\nmemory consumption as well as improved rendering efficiency. We propose a\ncompressed 3D Gaussian splat representation that utilizes sensitivity-aware\nvector clustering with quantization-aware training to compress directional\ncolors and Gaussian parameters. The learned codebooks have low bitrates and\nachieve a compression rate of up to $31\\times$ on real-world scenes with only\nminimal degradation of visual quality. We demonstrate that the compressed splat\nrepresentation can be efficiently rendered with hardware rasterization on\nlightweight GPUs at up to $4\\times$ higher framerates than reported via an\noptimized GPU compute pipeline. Extensive experiments across multiple datasets\ndemonstrate the robustness and rendering speed of the proposed approach.\n","authors":["Simon Niedermayr","Josef Stumpfegger","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2401.02436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11796v1","updated":"2024-01-22T09:53:20Z","published":"2024-01-22T09:53:20Z","title":"Local Agnostic Video Explanations: a Study on the Applicability of\n Removal-Based Explanations to Video","summary":" Explainable artificial intelligence techniques are becoming increasingly\nimportant with the rise of deep learning applications in various domains.
These\ntechniques aim to provide a better understanding of complex \"black box\" models\nand enhance user trust while maintaining high learning performance. While many\nstudies have focused on explaining deep learning models in computer vision for\nimage input, video explanations remain relatively unexplored due to the\ntemporal dimension's complexity. In this paper, we present a unified framework\nfor local agnostic explanations in the video domain. Our contributions include:\n(1) Extending a fine-grained explanation framework tailored for computer vision\ndata, (2) Adapting six existing explanation techniques to work on video data by\nincorporating temporal information and enabling local explanations, and (3)\nConducting an evaluation and comparison of the adapted explanation methods\nusing different models and datasets. We discuss the possibilities and choices\ninvolved in the removal-based explanation process for visual data. The\nadaptation of six explanation methods for video is explained, with comparisons\nto existing approaches. We evaluate the performance of the methods using\nautomated metrics and user-based evaluation, showing that 3D RISE, 3D LIME, and\n3D Kernel SHAP outperform other methods. By decomposing the explanation process\ninto manageable steps, we facilitate the study of each choice's impact and\nallow for further refinement of explanation methods to suit specific datasets\nand models.\n","authors":["F. Xavier Gaya-Morey","Jose M. Buades-Rubio","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12817v2","updated":"2024-01-22T09:44:18Z","published":"2023-10-19T15:12:44Z","title":"2D-3D Interlaced Transformer for Point Cloud Segmentation with\n Scene-Level Supervision","summary":" We present a Multimodal Interlaced Transformer (MIT) that jointly considers\n2D and 3D data for weakly supervised point cloud segmentation. Research studies\nhave shown that 2D and 3D features are complementary for point cloud\nsegmentation. However, existing methods require extra 2D annotations to achieve\n2D-3D information fusion. Considering the high annotation cost of point clouds,\neffective 2D and 3D feature fusion based on weakly supervised learning is in\ngreat demand. To this end, we propose a transformer model with two encoders and\none decoder for weakly supervised point cloud segmentation using only\nscene-level class tags. Specifically, the two encoders compute the\nself-attended features for 3D point clouds and 2D multi-view images,\nrespectively. The decoder implements interlaced 2D-3D cross-attention and\ncarries out implicit 2D and 3D feature fusion. We alternately switch the roles\nof queries and key-value pairs in the decoder layers. It turns out that the 2D\nand 3D features are iteratively enriched by each other. Experiments show that\nit performs favorably against existing weakly supervised point cloud\nsegmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The\nproject page will be available at https://jimmy15923.github.io/mit_web/.\n","authors":["Cheng-Kun Yang","Min-Hung Chen","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2310.12817v2.pdf","comment":"ICCV 2023 (main + supp). 
Website:\n https://jimmy15923.github.io/mit_web/"},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11790v1","updated":"2024-01-22T09:40:52Z","published":"2024-01-22T09:40:52Z","title":"Deep Learning for Computer Vision based Activity Recognition and Fall\n Detection of the Elderly: a Systematic Review","summary":" As the percentage of elderly people in developed countries increases\nworldwide, the healthcare of this collective is a worrying matter, especially\nif it includes the preservation of their autonomy. In this direction, many\nstudies are being published on Ambient Assisted Living (AAL) systems, which\nhelp to reduce the preoccupations raised by the independent living of the\nelderly. In this study, a systematic review of the literature is presented on\nfall detection and Human Activity Recognition (HAR) for the elderly, as the two\nmain tasks to solve to guarantee the safety of elderly people living alone. To\naddress the current tendency to perform these two tasks, the review focuses on\nthe use of Deep Learning (DL) based approaches on computer vision data. In\naddition, different collections of data like DL models, datasets or hardware\n(e.g. depth or thermal cameras) are gathered from the reviewed studies and\nprovided for reference in future studies. Strengths and weaknesses of existing\napproaches are also discussed and, based on them, our recommendations for\nfuture works are provided.\n","authors":["F. Xavier Gaya-Morey","Cristina Manresa-Yee","Jose M. 
Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11783v1","updated":"2024-01-22T09:29:42Z","published":"2024-01-22T09:29:42Z","title":"Full-Body Motion Reconstruction with Sparse Sensing from Graph\n Perspective","summary":" Estimating 3D full-body pose from sparse sensor data is a pivotal technique\nemployed for the reconstruction of realistic human motions in Augmented Reality\nand Virtual Reality. However, translating sparse sensor signals into\ncomprehensive human motion remains a challenge since the sparsely distributed\nsensors in common VR systems fail to capture the motion of full human body. In\nthis paper, we use well-designed Body Pose Graph (BPG) to represent the human\nbody and translate the challenge into a prediction problem of graph missing\nnodes. Then, we propose a novel full-body motion reconstruction framework based\non BPG. To establish BPG, nodes are initially endowed with features extracted\nfrom sparse sensor signals. Features from identifiable joint nodes across\ndiverse sensors are amalgamated and processed from both temporal and spatial\nperspectives. Temporal dynamics are captured using the Temporal Pyramid\nStructure, while spatial relations in joint movements inform the spatial\nattributes. The resultant features serve as the foundational elements of the\nBPG nodes. To further refine the BPG, node features are updated through a graph\nneural network that incorporates edge reflecting varying joint relations. Our\nmethod's effectiveness is evidenced by the attained state-of-the-art\nperformance, particularly in lower body motion, outperforming other baseline\nmethods. Additionally, an ablation study validates the efficacy of each module\nin our proposed framework.\n","authors":["Feiyu Yao","Zongkai Wu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2401.11783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11775v1","updated":"2024-01-22T09:11:12Z","published":"2024-01-22T09:11:12Z","title":"Collaborative Position Reasoning Network for Referring Image\n Segmentation","summary":" Given an image and a natural language expression as input, the goal of\nreferring image segmentation is to segment the foreground masks of the entities\nreferred by the expression. Existing methods mainly focus on interactive\nlearning between vision and language to enhance the multi-modal representations\nfor global context reasoning. However, predicting directly in pixel-level space\ncan lead to collapsed positioning and poor segmentation results. Its main\nchallenge lies in how to explicitly model entity localization, especially for\nnon-salient entities. In this paper, we tackle this problem by executing a\nCollaborative Position Reasoning Network (CPRN) via the proposed novel\nRow-and-Column interactive (RoCo) and Guided Holistic interactive (Holi)\nmodules. Specifically, RoCo aggregates the visual features into the row- and\ncolumn-wise features corresponding two directional axes respectively. It offers\na fine-grained matching behavior that perceives the associations between the\nlinguistic features and two decoupled visual features to perform position\nreasoning over a hierarchical space. 
Holi integrates features of the two\nmodalities by a cross-modal attention mechanism, which suppresses the\nirrelevant redundancy under the guide of positioning information from RoCo.\nThus, with the incorporation of RoCo and Holi modules, CPRN captures the visual\ndetails of position reasoning so that the model can achieve more accurate\nsegmentation. To our knowledge, this is the first work that explicitly focuses\non position reasoning modeling. We also validate the proposed method on three\nevaluation datasets. It consistently outperforms existing state-of-the-art\nmethods.\n","authors":["Jianjian Cao","Beiya Dai","Yulin Li","Xiameng Qin","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17004v2","updated":"2024-01-22T09:10:04Z","published":"2023-12-28T13:16:03Z","title":"Continual Learning in Medical Image Analysis: A Comprehensive Review of\n Recent Advancements and Future Prospects","summary":" Medical imaging analysis has witnessed remarkable advancements even\nsurpassing human-level performance in recent years, driven by the rapid\ndevelopment of advanced deep-learning algorithms. However, when the inference\ndataset slightly differs from what the model has seen during one-time training,\nthe model performance is greatly compromised. The situation requires restarting\nthe training process using both the old and the new data which is\ncomputationally costly, does not align with the human learning process, and\nimposes storage constraints and privacy concerns. Alternatively, continual\nlearning has emerged as a crucial approach for developing unified and\nsustainable deep models to deal with new classes, tasks, and the drifting\nnature of data in non-stationary environments for various application areas.\nContinual learning techniques enable models to adapt and accumulate knowledge\nover time, which is essential for maintaining performance on evolving datasets\nand novel tasks. This systematic review paper provides a comprehensive overview\nof the state-of-the-art in continual learning techniques applied to medical\nimaging analysis. We present an extensive survey of existing research, covering\ntopics including catastrophic forgetting, data drifts, stability, and\nplasticity requirements. Further, an in-depth discussion of key components of a\ncontinual learning framework such as continual learning scenarios, techniques,\nevaluation schemes, and metrics is provided. Continual learning techniques\nencompass various categories, including rehearsal, regularization,\narchitectural, and hybrid strategies. We assess the popularity and\napplicability of continual learning categories in various medical sub-fields\nlike radiology and histopathology...\n","authors":["Pratibha Kumari","Joohi Chauhan","Afshin Bozorgpour","Boqiang Huang","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2312.17004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11767v1","updated":"2024-01-22T09:02:52Z","published":"2024-01-22T09:02:52Z","title":"Concealed Object Segmentation with Hierarchical Coherence Modeling","summary":" Concealed object segmentation (COS) is a challenging task that involves\nlocalizing and segmenting those concealed objects that are visually blended\nwith their surrounding environments. Despite achieving remarkable success,\nexisting COS segmenters still struggle to achieve complete segmentation results\nin extremely concealed scenarios. 
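As a rough sketch of the Holi-style step, the snippet below lets visual features attend to linguistic features with standard multi-head cross-attention and damps the output with a positional prior, which loosely stands in for the guidance RoCo would provide. The shapes, the sigmoid prior, and the module name are assumptions, not the CPRN design.

import torch
import torch.nn as nn

class CrossModalAttention(nn.Module):
    """Visual queries attend to linguistic keys/values; an optional multiplicative
    bias stands in for positional guidance from a separate reasoning branch."""
    def __init__(self, dim, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, vis, lang, pos_bias=None):
        out, weights = self.attn(query=vis, key=lang, value=lang)
        if pos_bias is not None:            # suppress locations far from the predicted position
            out = out * pos_bias
        return self.norm(vis + out), weights

B, HW, L, D = 2, 49, 12, 64                 # 7x7 visual grid, 12 language tokens (assumed sizes)
vis = torch.randn(B, HW, D)
lang = torch.randn(B, L, D)
pos_bias = torch.sigmoid(torch.randn(B, HW, 1))   # stand-in positional prior per location

block = CrossModalAttention(D)
fused, attn = block(vis, lang, pos_bias)
print(fused.shape, attn.shape)              # torch.Size([2, 49, 64]) torch.Size([2, 49, 12])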
In this paper, we propose a Hierarchical\nCoherence Modeling (HCM) segmenter for COS, aiming to address this incomplete\nsegmentation limitation. In specific, HCM promotes feature coherence by\nleveraging the intra-stage coherence and cross-stage coherence modules,\nexploring feature correlations at both the single-stage and contextual levels.\nAdditionally, we introduce the reversible re-calibration decoder to detect\npreviously undetected parts in low-confidence regions, resulting in further\nenhancing segmentation performance. Extensive experiments conducted on three\nCOS tasks, including camouflaged object detection, polyp image segmentation,\nand transparent object detection, demonstrate the promising results achieved by\nthe proposed HCM segmenter.\n","authors":["Fengyang Xiao","Pan Zhang","Chunming He","Runze Hu","Yutao Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11767v1.pdf","comment":"Accepted to CICAI 2023. 13 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11751v1","updated":"2024-01-22T08:23:52Z","published":"2024-01-22T08:23:52Z","title":"Boosting Multi-view Stereo with Late Cost Aggregation","summary":" Pairwise matching cost aggregation is a crucial step for modern\nlearning-based Multi-view Stereo (MVS). Prior works adopt an early aggregation\nscheme, which adds up pairwise costs into an intermediate cost. However, we\nanalyze that this process can degrade informative pairwise matchings, thereby\nblocking the depth network from fully utilizing the original geometric matching\ncues.To address this challenge, we present a late aggregation approach that\nallows for aggregating pairwise costs throughout the network feed-forward\nprocess, achieving accurate estimations with only minor changes of the plain\nCasMVSNet.Instead of building an intermediate cost by weighted sum, late\naggregation preserves all pairwise costs along a distinct view channel. This\nenables the succeeding depth network to fully utilize the crucial geometric\ncues without loss of cost fidelity. Grounded in the new aggregation scheme, we\npropose further techniques addressing view order dependence inside the\npreserved cost, handling flexible testing views, and improving the depth\nfiltering process. Despite its technical simplicity, our method improves\nsignificantly upon the baseline cascade-based approach, achieving comparable\nresults with state-of-the-art methods with favorable computation overhead.\n","authors":["Jiang Wu","Rui Li","Yu Zhu","Wenxun Zhao","Jinqiu Sun","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11751v1.pdf","comment":"Code and models are available at https://github.com/Wuuu3511/LAMVSNET"},{"id":"http://arxiv.org/abs/2401.11740v1","updated":"2024-01-22T07:37:25Z","published":"2024-01-22T07:37:25Z","title":"Multi-level Cross-modal Alignment for Image Clustering","summary":" Recently, the cross-modal pretraining model has been employed to produce\nmeaningful pseudo-labels to supervise the training of an image clustering\nmodel. However, numerous erroneous alignments in a cross-modal pre-training\nmodel could produce poor-quality pseudo-labels and degrade clustering\nperformance. To solve the aforementioned issue, we propose a novel\n\\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in\na cross-modal pretraining model for downstream tasks, by building a smaller but\nbetter semantic space and aligning the images and texts in three levels, i.e.,\ninstance-level, prototype-level, and semantic-level. 
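The contrast between early and late cost aggregation described above can be shown in a few lines: instead of collapsing pairwise matching costs into one intermediate volume, the late scheme stacks them along a distinct view channel for the depth network to fuse. The plain dot-product cost and tensor shapes below are simplifying assumptions, not the paper's exact cost construction.

import torch

def pairwise_costs(ref_feat, src_feats):
    """Dot-product matching cost between a reference feature volume and each
    warped source volume.  ref_feat: (B, C, D, H, W); src_feats: list of the same shape."""
    return [(ref_feat * src).mean(dim=1) for src in src_feats]   # each (B, D, H, W)

B, C, D, H, W = 1, 8, 32, 16, 16
ref = torch.randn(B, C, D, H, W)
srcs = [torch.randn(B, C, D, H, W) for _ in range(4)]            # 4 warped source views

costs = pairwise_costs(ref, srcs)

# Early aggregation: collapse views into one intermediate cost volume.
early = torch.stack(costs, dim=1).mean(dim=1)                    # (B, D, H, W)

# Late aggregation: preserve every pairwise cost along a distinct view channel
# and let the subsequent 3D depth network fuse them itself.
late = torch.stack(costs, dim=1)                                 # (B, V, D, H, W)

print(early.shape, late.shape)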
Theoretical results show\nthat our proposed method converges, and suggests effective means to reduce the\nexpected clustering risk of our method. Experimental results on five benchmark\ndatasets clearly show the superiority of our new method.\n","authors":["Liping Qiu","Qin Zhang","Xiaojun Chen","Shaotian Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11739v1","updated":"2024-01-22T07:34:06Z","published":"2024-01-22T07:34:06Z","title":"EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models","summary":" Diffusion models have recently received increasing research attention for\ntheir remarkable transfer abilities in semantic segmentation tasks. However,\ngenerating fine-grained segmentation masks with diffusion models often requires\nadditional training on annotated datasets, leaving it unclear to what extent\npre-trained diffusion models alone understand the semantic relations of their\ngenerated images. To address this question, we leverage the semantic knowledge\nextracted from Stable Diffusion (SD) and aim to develop an image segmentor\ncapable of generating fine-grained segmentation maps without any additional\ntraining. The primary difficulty stems from the fact that semantically\nmeaningful feature maps typically exist only in the spatially lower-dimensional\nlayers, which poses a challenge in directly extracting pixel-level semantic\nrelations from these feature maps. To overcome this issue, our framework\nidentifies semantic correspondences between image pixels and spatial locations\nof low-dimensional feature maps by exploiting SD's generation process and\nutilizes them for constructing image-resolution segmentation maps. In extensive\nexperiments, the produced segmentation maps are demonstrated to be well\ndelineated and capture detailed parts of the images, indicating the existence\nof highly accurate pixel-level semantic knowledge in diffusion models.\n","authors":["Koichi Namekata","Amirmojtaba Sabour","Sanja Fidler","Seung Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11739v1.pdf","comment":"ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/"},{"id":"http://arxiv.org/abs/2401.11738v1","updated":"2024-01-22T07:31:52Z","published":"2024-01-22T07:31:52Z","title":"MetaSeg: Content-Aware Meta-Net for Omni-Supervised Semantic\n Segmentation","summary":" Noisy labels, inevitably existing in pseudo segmentation labels generated\nfrom weak object-level annotations, severely hampers model optimization for\nsemantic segmentation. Previous works often rely on massive hand-crafted losses\nand carefully-tuned hyper-parameters to resist noise, suffering poor\ngeneralization capability and high model complexity. Inspired by recent\nadvances in meta learning, we argue that rather than struggling to tolerate\nnoise hidden behind clean labels passively, a more feasible solution would be\nto find out the noisy regions actively, so as to simply ignore them during\nmodel optimization. With this in mind, this work presents a novel meta learning\nbased semantic segmentation method, MetaSeg, that comprises a primary\ncontent-aware meta-net (CAM-Net) to sever as a noise indicator for an arbitrary\nsegmentation model counterpart. 
Specifically, CAM-Net learns to generate\npixel-wise weights to suppress noisy regions with incorrect pseudo labels while\nhighlighting clean ones by exploiting hybrid strengthened features from image\ncontent, providing straightforward and reliable guidance for optimizing the\nsegmentation model. Moreover, to break the barrier of time-consuming training\nwhen applying meta learning to common large segmentation models, we further\npresent a new decoupled training strategy that optimizes different model layers\nin a divide-and-conquer manner. Extensive experiments on object, medical,\nremote sensing and human segmentation shows that our method achieves superior\nperformance, approaching that of fully supervised settings, which paves a new\npromising way for omni-supervised semantic segmentation.\n","authors":["Shenwang Jiang","Jianan Li","Ying Wang","Wenxuan Wu","Jizhou Zhang","Bo Huang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16244v2","updated":"2024-01-22T07:29:09Z","published":"2023-12-25T11:39:00Z","title":"Modality-missing RGBT Tracking via Invertible Prompt Learning and A\n High-quality Data Simulation Method","summary":" Current RGBT tracking researches mainly focus on the modality-complete\nscenarios, overlooking the modality-missing challenge in real-world scenes. In\nthis work, we comprehensively investigate the impact of modality-missing\nchallenge in RGBT tracking and propose a novel invertible prompt learning\napproach, which integrates the content-preserving prompts into a well-trained\ntracking model to adapt to various modality-missing scenarios, for\nmodality-missing RGBT tracking. In particular, given one modality-missing\nscenario, we propose to utilize the available modality to generate the prompt\nof the missing modality to adapt to RGBT tracking model. However, the\ncross-modality gap between available and missing modalities usually causes\nsemantic distortion and information loss in prompt generation. To handle this\nissue, we propose the invertible prompt learning scheme by incorporating the\nfull reconstruction of the input available modality from the prompt in prompt\ngeneration model. Considering that there lacks a modality-missing RGBT tracking\ndataset and many modality-missing scenarios are difficult to capture, we design\na high-quality data simulation method based on hierarchical combination schemes\nto generate real-world modality-missing data. Extensive experiments on three\nmodality-missing datasets show that our method achieves significant performance\nimprovements compared with state-of-the-art methods. We will release the code\nand simulation dataset.\n","authors":["Andong Lu","Jiacong Zhao","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2312.16244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14571v4","updated":"2024-01-22T07:24:58Z","published":"2022-10-26T09:01:19Z","title":"Towards the Detection of Diffusion Model Deepfakes","summary":" In the course of the past few years, diffusion models (DMs) have reached an\nunprecedented level of visual quality. However, relatively little attention has\nbeen paid to the detection of DM-generated images, which is critical to prevent\nadverse impacts on our society. In contrast, generative adversarial networks\n(GANs), have been extensively studied from a forensic perspective. 
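The core of the CAM-Net idea, per-pixel weights that suppress noisy pseudo-labels in the segmentation loss, can be sketched as below. The one-layer stand-in "meta-net", its input, and the weighted cross-entropy are illustrative simplifications; the actual CAM-Net, its hybrid features, and the decoupled meta-learning loop are more involved.

import torch
import torch.nn as nn
import torch.nn.functional as F

num_classes, B, H, W = 5, 2, 32, 32
logits = torch.randn(B, num_classes, H, W, requires_grad=True)   # segmentation model output
pseudo = torch.randint(0, num_classes, (B, H, W))                # noisy pseudo-labels

# A tiny stand-in for CAM-Net: predicts a weight in [0, 1] per pixel
# (here from the logits themselves, for brevity).
meta_net = nn.Sequential(nn.Conv2d(num_classes, 16, 3, padding=1), nn.ReLU(),
                         nn.Conv2d(16, 1, 1), nn.Sigmoid())
weights = meta_net(logits).squeeze(1)                            # (B, H, W), ~0 means "ignore this pixel"

per_pixel_ce = F.cross_entropy(logits, pseudo, reduction="none") # (B, H, W)
loss = (weights * per_pixel_ce).sum() / weights.sum().clamp(min=1e-6)
loss.backward()                                                  # trains both nets in this toy setup
print(float(loss))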
In this\nwork, we therefore take the natural next step to evaluate whether previous\nmethods can be used to detect images generated by DMs. Our experiments yield\ntwo key findings: (1) state-of-the-art GAN detectors are unable to reliably\ndistinguish real from DM-generated images, but (2) re-training them on\nDM-generated images allows for almost perfect detection, which remarkably even\ngeneralizes to GANs. Together with a feature space analysis, our results lead\nto the hypothesis that DMs produce fewer detectable artifacts and are thus more\ndifficult to detect compared to GANs. One possible reason for this is the\nabsence of grid-like frequency artifacts in DM-generated images, which are a\nknown weakness of GANs. However, we make the interesting observation that\ndiffusion models tend to underestimate high frequencies, which we attribute to\nthe learning objective.\n","authors":["Jonas Ricker","Simon Damm","Thorsten Holz","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2210.14571v4.pdf","comment":"Accepted at VISAPP 2024. This is the extended version with additional\n experiments and supplemental material. Code and data:\n https://github.com/jonasricker/diffusion-model-deepfake-detection"},{"id":"http://arxiv.org/abs/2401.11734v1","updated":"2024-01-22T07:23:44Z","published":"2024-01-22T07:23:44Z","title":"Colorectal Polyp Segmentation in the Deep Learning Era: A Comprehensive\n Survey","summary":" Colorectal polyp segmentation (CPS), an essential problem in medical image\nanalysis, has garnered growing research attention. Recently, the deep\nlearning-based model completely overwhelmed traditional methods in the field of\nCPS, and more and more deep CPS methods have emerged, bringing the CPS into the\ndeep learning era. To help the researchers quickly grasp the main techniques,\ndatasets, evaluation metrics, challenges, and trending of deep CPS, this paper\npresents a systematic and comprehensive review of deep-learning-based CPS\nmethods from 2014 to 2023, a total of 115 technical papers. In particular, we\nfirst provide a comprehensive review of the current deep CPS with a novel\ntaxonomy, including network architectures, level of supervision, and learning\nparadigm. More specifically, network architectures include eight subcategories,\nthe level of supervision comprises six subcategories, and the learning paradigm\nencompasses 12 subcategories, totaling 26 subcategories. Then, we provided a\ncomprehensive analysis the characteristics of each dataset, including the\nnumber of datasets, annotation types, image resolution, polyp size, contrast\nvalues, and polyp location. Following that, we summarized CPS's commonly used\nevaluation metrics and conducted a detailed analysis of 40 deep SOTA models,\nincluding out-of-distribution generalization and attribute-based performance\nanalysis. Finally, we discussed deep learning-based CPS methods' main\nchallenges and opportunities.\n","authors":["Zhenyu Wu","Fengmao Lv","Chenglizhao Chen","Aimin Hao","Shuo Li"],"pdf_url":"https://arxiv.org/pdf/2401.11734v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.02773v3","updated":"2024-01-22T07:18:55Z","published":"2023-09-06T06:31:08Z","title":"Diffusion Model is Secretly a Training-free Open Vocabulary Semantic\n Segmenter","summary":" The pre-trained text-image discriminative models, such as CLIP, has been\nexplored for open-vocabulary semantic segmentation with unsatisfactory results\ndue to the loss of crucial localization information and awareness of object\nshapes. 
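The high-frequency observation above is commonly probed with a radially averaged power spectrum; a minimal numpy sketch follows, where a random array and a crude low-pass filter stand in for a real and a diffusion-generated image, and the chosen frequency band is arbitrary.

import numpy as np

def radial_power_spectrum(img):
    """Azimuthally averaged power spectrum of a grayscale image (H, W)."""
    f = np.fft.fftshift(np.fft.fft2(img))
    power = np.abs(f) ** 2
    h, w = img.shape
    yy, xx = np.indices((h, w))
    r = np.sqrt((yy - h / 2) ** 2 + (xx - w / 2) ** 2).astype(int)
    # Mean power within each integer radius bin.
    sums = np.bincount(r.ravel(), weights=power.ravel())
    counts = np.bincount(r.ravel())
    return sums / np.maximum(counts, 1)

rng = np.random.default_rng(0)
real = rng.random((128, 128))
fake = 0.5 * (np.roll(real, 1, 0) + real)          # crude low-pass surrogate for a generated image

spec_real = radial_power_spectrum(real)
spec_fake = radial_power_spectrum(fake)
high = slice(40, 64)                               # a high-frequency band (arbitrary choice)
print(spec_fake[high].mean() < spec_real[high].mean())   # True: less high-frequency energy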
Recently, there has been a growing interest in expanding the\napplication of generative models from generation tasks to semantic\nsegmentation. These approaches utilize generative models either for generating\nannotated data or extracting features to facilitate semantic segmentation. This\ntypically involves generating a considerable amount of synthetic data or\nrequiring additional mask annotations. To this end, we uncover the potential of\ngenerative text-to-image diffusion models (e.g., Stable Diffusion) as highly\nefficient open-vocabulary semantic segmenters, and introduce a novel\ntraining-free approach named DiffSegmenter. The insight is that to generate\nrealistic objects that are semantically faithful to the input text, both the\ncomplete object shapes and the corresponding semantics are implicitly learned\nby diffusion models. We discover that the object shapes are characterized by\nthe self-attention maps while the semantics are indicated through the\ncross-attention maps produced by the denoising U-Net, forming the basis of our\nsegmentation results.Additionally, we carefully design effective textual\nprompts and a category filtering mechanism to further enhance the segmentation\nresults. Extensive experiments on three benchmark datasets show that the\nproposed DiffSegmenter achieves impressive results for open-vocabulary semantic\nsegmentation.\n","authors":["Jinglong Wang","Xiawei Li","Jing Zhang","Qingyuan Xu","Qin Zhou","Qian Yu","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2309.02773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11726v1","updated":"2024-01-22T07:07:32Z","published":"2024-01-22T07:07:32Z","title":"Detecting Out-of-Distribution Samples via Conditional Distribution\n Entropy with Optimal Transport","summary":" When deploying a trained machine learning model in the real world, it is\ninevitable to receive inputs from out-of-distribution (OOD) sources. For\ninstance, in continual learning settings, it is common to encounter OOD samples\ndue to the non-stationarity of a domain. More generally, when we have access to\na set of test inputs, the existing rich line of OOD detection solutions,\nespecially the recent promise of distance-based methods, falls short in\neffectively utilizing the distribution information from training samples and\ntest inputs. In this paper, we argue that empirical probability distributions\nthat incorporate geometric information from both training samples and test\ninputs can be highly beneficial for OOD detection in the presence of test\ninputs available. To address this, we propose to model OOD detection as a\ndiscrete optimal transport problem. Within the framework of optimal transport,\nwe propose a novel score function known as the \\emph{conditional distribution\nentropy} to quantify the uncertainty of a test input being an OOD sample. Our\nproposal inherits the merits of certain distance-based methods while\neliminating the reliance on distribution assumptions, a-prior knowledge, and\nspecific training mechanisms. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method outperforms its competitors in OOD\ndetection.\n","authors":["Chuanwen Feng","Wenlong Chen","Ao Ke","Yilong Ren","Xike Xie","S. 
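A minimal sketch of the recipe DiffSegmenter builds on: take a token's cross-attention map as its semantics and propagate it through powers of the self-attention map, which encodes object shape, then threshold. The random tensors below stand in for maps extracted from the denoising U-Net, and the propagation power and threshold are arbitrary choices rather than the paper's settings.

import torch

def attention_to_mask(cross_attn, self_attn, token_idx, power=4, thresh=0.5):
    """cross_attn: (HW, T) attention from image locations to text tokens;
    self_attn: (HW, HW) attention among image locations.
    Propagate the token's cross-attention through (self-attention)^power,
    then normalise and threshold to obtain a binary mask."""
    relevance = cross_attn[:, token_idx]                          # (HW,)
    propagation = torch.linalg.matrix_power(self_attn, power)     # shape-aware smoothing
    refined = propagation @ relevance
    refined = (refined - refined.min()) / (refined.max() - refined.min() + 1e-8)
    return (refined >= thresh).float()

HW, T = 16 * 16, 8                                 # 16x16 latent grid, 8 text tokens (assumed sizes)
cross = torch.rand(HW, T).softmax(dim=-1)          # stand-in cross-attention maps
self_attn = torch.rand(HW, HW).softmax(dim=-1)     # stand-in self-attention, rows sum to one

mask = attention_to_mask(cross, self_attn, token_idx=3).reshape(16, 16)
print(mask.shape, mask.unique())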
Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11724v1","updated":"2024-01-22T06:56:52Z","published":"2024-01-22T06:56:52Z","title":"Augmenting Prototype Network with TransMix for Few-shot Hyperspectral\n Image Classification","summary":" Few-shot hyperspectral image classification aims to identify the classes of\neach pixel in the images by only marking few of these pixels. And in order to\nobtain the spatial-spectral joint features of each pixel, the fixed-size\npatches centering around each pixel are often used for classification. However,\nobserving the classification results of existing methods, we found that\nboundary patches corresponding to the pixels which are located at the boundary\nof the objects in the hyperspectral images, are hard to classify. These\nboundary patchs are mixed with multi-class spectral information. Inspired by\nthis, we propose to augment the prototype network with TransMix for few-shot\nhyperspectrial image classification(APNT). While taking the prototype network\nas the backbone, it adopts the transformer as feature extractor to learn the\npixel-to-pixel relation and pay different attentions to different pixels. At\nthe same time, instead of directly using the patches which are cut from the\nhyperspectral images for training, it randomly mixs up two patches to imitate\nthe boundary patches and uses the synthetic patches to train the model, with\nthe aim to enlarge the number of hard training samples and enhance their\ndiversity. And by following the data agumentation technique TransMix, the\nattention returned by the transformer is also used to mix up the labels of two\npatches to generate better labels for synthetic patches. Compared with existing\nmethods, the proposed method has demonstrated sate of the art performance and\nbetter robustness for few-shot hyperspectral image classification in our\nexperiments.\n","authors":["Chun Liu","Longwei Yang","Dongmei Dong","Zheng Li","Wei Yang","Zhigang Han","Jiayao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17240v3","updated":"2024-01-22T06:53:23Z","published":"2023-12-28T18:58:33Z","title":"LISA++: An Improved Baseline for Reasoning Segmentation with Large\n Language Model","summary":" While LISA effectively bridges the gap between segmentation and large\nlanguage models to enable reasoning segmentation, it poses certain limitations:\nunable to distinguish different instances of the target region, and constrained\nby the pre-defined textual response formats. In this work, we introduce LISA++,\nan update to the existing LISA model, focusing on improving core\nfunctionalities while keeping the base architecture intact. The main\nenhancements in LISA++ include: \\textbf{1) Enhanced Segmentation}: The instance\nsegmentation ability has been added, providing a more detailed scene analysis\nalong with the existing multi-region semantic segmentation. \\textbf{2) More\nNatural Conversation}: Improved capability for multi-turn dialogue, with the\nability to incorporate segmentation results directly into text responses, i.e.,\nSegmentation in Dialogue (SiD). These improvements are achieved by curating the\nexisting samples of generic segmentation datasets, aimed specifically at\nenhancing the segmentation and conversational skills without structural change\nand additional data sources. 
Comparative analysis with the original LISA model\nshows significant advancements in these areas, positioning LISA++ as a notable\nupgrade in visual understanding and interaction. LISA++'s adaptability and\nimproved features highlight the versatility of the mask-as-embedding paradigm\nproposed by LISA, and the potential as a foundational model for diverse\napplications.\n","authors":["Senqiao Yang","Tianyuan Qu","Xin Lai","Zhuotao Tian","Bohao Peng","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17240v3.pdf","comment":"Typo fixed"},{"id":"http://arxiv.org/abs/2211.08824v4","updated":"2024-01-22T06:46:27Z","published":"2022-11-16T10:49:48Z","title":"SMILEtrack: SiMIlarity LEarning for Occlusion-Aware Multiple Object\n Tracking","summary":" Despite recent progress in Multiple Object Tracking (MOT), several obstacles\nsuch as occlusions, similar objects, and complex scenes remain an open\nchallenge. Meanwhile, a systematic study of the cost-performance tradeoff for\nthe popular tracking-by-detection paradigm is still lacking. This paper\nintroduces SMILEtrack, an innovative object tracker that effectively addresses\nthese challenges by integrating an efficient object detector with a Siamese\nnetwork-based Similarity Learning Module (SLM). The technical contributions of\nSMILETrack are twofold. First, we propose an SLM that calculates the appearance\nsimilarity between two objects, overcoming the limitations of feature\ndescriptors in Separate Detection and Embedding (SDE) models. The SLM\nincorporates a Patch Self-Attention (PSA) block inspired by the vision\nTransformer, which generates reliable features for accurate similarity\nmatching. Second, we develop a Similarity Matching Cascade (SMC) module with a\nnovel GATE function for robust object matching across consecutive video frames,\nfurther enhancing MOT performance. Together, these innovations help SMILETrack\nachieve an improved trade-off between the cost ({\\em e.g.}, running speed) and\nperformance (e.g., tracking accuracy) over several existing state-of-the-art\nbenchmarks, including the popular BYTETrack method. SMILETrack outperforms\nBYTETrack by 0.4-0.8 MOTA and 2.1-2.2 HOTA points on MOT17 and MOT20 datasets.\nCode is available at https://github.com/pingyang1117/SMILEtrack_Official\n","authors":["Yu-Hsiang Wang","Jun-Wei Hsieh","Ping-Yang Chen","Ming-Ching Chang","Hung Hin So","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2211.08824v4.pdf","comment":"Our paper was accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.11719v1","updated":"2024-01-22T06:43:13Z","published":"2024-01-22T06:43:13Z","title":"SFC: Shared Feature Calibration in Weakly Supervised Semantic\n Segmentation","summary":" Image-level weakly supervised semantic segmentation has received increasing\nattention due to its low annotation cost. Existing methods mainly rely on Class\nActivation Mapping (CAM) to obtain pseudo-labels for training semantic\nsegmentation models. In this work, we are the first to demonstrate that\nlong-tailed distribution in training data can cause the CAM calculated through\nclassifier weights over-activated for head classes and under-activated for tail\nclasses due to the shared features among head- and tail- classes. This degrades\npseudo-label quality and further influences final semantic segmentation\nperformance. To address this issue, we propose a Shared Feature Calibration\n(SFC) method for CAM generation. 
Specifically, we leverage the class prototypes\nthat carry positive shared features and propose a Multi-Scaled\nDistribution-Weighted (MSDW) consistency loss for narrowing the gap between the\nCAMs generated through classifier weights and class prototypes during training.\nThe MSDW loss counterbalances over-activation and under-activation by\ncalibrating the shared features in head-/tail-class classifier weights.\nExperimental results show that our SFC significantly improves CAM boundaries\nand achieves new state-of-the-art performances. The project is available at\nhttps://github.com/Barrett-python/SFC.\n","authors":["Xinqiao Zhao","Feilong Tang","Xiaoyang Wang","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.11719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11718v1","updated":"2024-01-22T06:42:23Z","published":"2024-01-22T06:42:23Z","title":"MsSVT++: Mixed-scale Sparse Voxel Transformer with Center Voting for 3D\n Object Detection","summary":" Accurate 3D object detection in large-scale outdoor scenes, characterized by\nconsiderable variations in object scales, necessitates features rich in both\nlong-range and fine-grained information. While recent detectors have utilized\nwindow-based transformers to model long-range dependencies, they tend to\noverlook fine-grained details. To bridge this gap, we propose MsSVT++, an\ninnovative Mixed-scale Sparse Voxel Transformer that simultaneously captures\nboth types of information through a divide-and-conquer approach. This approach\ninvolves explicitly dividing attention heads into multiple groups, each\nresponsible for attending to information within a specific range. The outputs\nof these groups are subsequently merged to obtain final mixed-scale features.\nTo mitigate the computational complexity associated with applying a\nwindow-based transformer in 3D voxel space, we introduce a novel Chessboard\nSampling strategy and implement voxel sampling and gathering operations\nsparsely using a hash map. Moreover, an important challenge stems from the\nobservation that non-empty voxels are primarily located on the surface of\nobjects, which impedes the accurate estimation of bounding boxes. To overcome\nthis challenge, we introduce a Center Voting module that integrates newly voted\nvoxels enriched with mixed-scale contextual information towards the centers of\nthe objects, thereby improving precise object localization. Extensive\nexperiments demonstrate that our single-stage detector, built upon the\nfoundation of MsSVT++, consistently delivers exceptional performance across\ndiverse datasets.\n","authors":["Jianan Li","Shaocong Dong","Lihe Ding","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.03842v4","updated":"2024-01-22T06:30:15Z","published":"2022-04-08T05:11:04Z","title":"From 2D Images to 3D Model:Weakly Supervised Multi-View Face\n Reconstruction with Deep Fusion","summary":" While weakly supervised multi-view face reconstruction (MVR) is garnering\nincreased attention, one critical issue still remains open: how to effectively\nfuse multiple image information to reconstruct high-precision 3D models. In\nthis regard, we propose a novel model called Deep Fusion MVR (DF-MVR) to\nreconstruct high-precision 3D facial shapes from multi-view images.\nSpecifically, we introduce MulEn-Unet, a multi-view encoding to single decoding\nframework with skip connections and attention. 
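A simplified sketch of the two CAM sources being aligned: one CAM from the classifier weights and one from class prototypes, with a plain L1 consistency between them. This single-scale, unweighted loss is only a stand-in for the Multi-Scaled Distribution-Weighted (MSDW) loss; the shapes, random prototypes, and max-normalisation are assumptions.

import torch
import torch.nn.functional as F

B, C, H, W, K = 2, 64, 14, 14, 5
feats = torch.randn(B, C, H, W, requires_grad=True)    # backbone feature maps
classifier_w = torch.randn(K, C)                        # weights of the classification head
prototypes = torch.randn(K, C)                          # e.g. running means of class features

def cam(feats, weight):
    """Class activation maps: per-class dot product between weights and features."""
    maps = torch.einsum("kc,bchw->bkhw", weight, feats)
    return F.relu(maps) / maps.flatten(2).amax(dim=2).clamp(min=1e-6)[..., None, None]

cam_w = cam(feats, classifier_w)                        # classifier-weight CAM
cam_p = cam(feats, prototypes)                          # prototype CAM
consistency = F.l1_loss(cam_w, cam_p)                   # narrow the gap between the two CAMs
consistency.backward()
print(cam_w.shape, float(consistency))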
This design allows for the\nextraction, integration, and compensation of deep features with attention from\nmulti-view images. Furthermore, we adopt the involution kernel to enrich deep\nfusion features with channel features. In addition, we develop the face parse\nnetwork to learn, identify, and emphasize the critical common face area within\nmulti-view images. Experiments on Pixel-Face and Bosphorus datasets indicate\nthe superiority of our model. Without 3D annotation, DF-MVR achieves 5.2% and\n3.0% RMSE improvement over the existing weakly supervised MVRs respectively on\nPixel-Face and Bosphorus dataset. Code will be available publicly at\nhttps://github.com/weiguangzhao/DF_MVR.\n","authors":["Weiguang Zhao","Chaolong Yang","Jianan Ye","Rui Zhang","Yuyao Yan","Xi Yang","Bin Dong","Amir Hussain","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2204.03842v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11713v1","updated":"2024-01-22T06:29:52Z","published":"2024-01-22T06:29:52Z","title":"Medical Image Debiasing by Learning Adaptive Agreement from a Biased\n Council","summary":" Deep learning could be prone to learning shortcuts raised by dataset bias and\nresult in inaccurate, unreliable, and unfair models, which impedes its adoption\nin real-world clinical applications. Despite its significance, there is a\ndearth of research in the medical image classification domain to address\ndataset bias. Furthermore, the bias labels are often agnostic, as identifying\nbiases can be laborious and depend on post-hoc interpretation. This paper\nproposes learning Adaptive Agreement from a Biased Council (Ada-ABC), a\ndebiasing framework that does not rely on explicit bias labels to tackle\ndataset bias in medical images. Ada-ABC develops a biased council consisting of\nmultiple classifiers optimized with generalized cross entropy loss to learn the\ndataset bias. A debiasing model is then simultaneously trained under the\nguidance of the biased council. Specifically, the debiasing model is required\nto learn adaptive agreement with the biased council by agreeing on the\ncorrectly predicted samples and disagreeing on the wrongly predicted samples by\nthe biased council. In this way, the debiasing model could learn the target\nattribute on the samples without spurious correlations while also avoiding\nignoring the rich information in samples with spurious correlations. We\ntheoretically demonstrated that the debiasing model could learn the target\nfeatures when the biased model successfully captures dataset bias. Moreover, to\nour best knowledge, we constructed the first medical debiasing benchmark from\nfour datasets containing seven different bias scenarios. Our extensive\nexperiments practically showed that our proposed Ada-ABC outperformed\ncompetitive approaches, verifying its effectiveness in mitigating dataset bias\nfor medical image classification. The codes and organized benchmark datasets\nwill be made publicly available.\n","authors":["Luyang Luo","Xin Huang","Minghao Wang","Zhuoyue Wan","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11713v1.pdf","comment":"10 pages, 5 figures, 3 tables. 
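Two ingredients of the Ada-ABC recipe can be sketched directly: the generalized cross-entropy (GCE) loss that lets the biased council latch onto easy, bias-aligned samples, and a reweighting that emphasizes the samples the council gets wrong when training the debiasing model. The reweighting below is a crude stand-in for the paper's adaptive agreement objective, and all logits are random placeholders.

import torch
import torch.nn.functional as F

def gce_loss(logits, target, q=0.7):
    """Generalized cross-entropy: (1 - p_y^q) / q, interpolating between CE (q -> 0) and MAE (q = 1)."""
    p_y = F.softmax(logits, dim=1).gather(1, target[:, None]).squeeze(1)
    return ((1.0 - p_y.clamp(min=1e-6) ** q) / q).mean()

B, K = 8, 3
x_biased = torch.randn(B, K, requires_grad=True)        # biased-council logits (stand-in)
x_debias = torch.randn(B, K, requires_grad=True)        # debiasing-model logits (stand-in)
y = torch.randint(0, K, (B,))

council_loss = gce_loss(x_biased, y)                    # the council happily fits the bias

# Debiasing model: emphasise samples the council mispredicts (likely bias-conflicting ones),
# a simplified stand-in for the paper's adaptive agreement objective.
wrong = (x_biased.argmax(dim=1) != y).float().detach()
per_sample_ce = F.cross_entropy(x_debias, y, reduction="none")
debias_loss = ((1.0 + 2.0 * wrong) * per_sample_ce).mean()

(council_loss + debias_loss).backward()
print(float(council_loss), float(debias_loss))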
Code and benchmark will be released\n via https://github.com/LLYXC/Ada-ABC/tree/main"},{"id":"http://arxiv.org/abs/2401.11711v1","updated":"2024-01-22T06:28:08Z","published":"2024-01-22T06:28:08Z","title":"HG3-NeRF: Hierarchical Geometric, Semantic, and Photometric Guided\n Neural Radiance Fields for Sparse View Inputs","summary":" Neural Radiance Fields (NeRF) have garnered considerable attention as a\nparadigm for novel view synthesis by learning scene representations from\ndiscrete observations. Nevertheless, NeRF exhibit pronounced performance\ndegradation when confronted with sparse view inputs, consequently curtailing\nits further applicability. In this work, we introduce Hierarchical Geometric,\nSemantic, and Photometric Guided NeRF (HG3-NeRF), a novel methodology that can\naddress the aforementioned limitation and enhance consistency of geometry,\nsemantic content, and appearance across different views. We propose\nHierarchical Geometric Guidance (HGG) to incorporate the attachment of\nStructure from Motion (SfM), namely sparse depth prior, into the scene\nrepresentations. Different from direct depth supervision, HGG samples volume\npoints from local-to-global geometric regions, mitigating the misalignment\ncaused by inherent bias in the depth prior. Furthermore, we draw inspiration\nfrom notable variations in semantic consistency observed across images of\ndifferent resolutions and propose Hierarchical Semantic Guidance (HSG) to learn\nthe coarse-to-fine semantic content, which corresponds to the coarse-to-fine\nscene representations. Experimental results demonstrate that HG3-NeRF can\noutperform other state-of-the-art methods on different standard benchmarks and\nachieve high-fidelity synthesis results for sparse view inputs.\n","authors":["Zelin Gao","Weichen Dai","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11711v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.11708v1","updated":"2024-01-22T06:16:29Z","published":"2024-01-22T06:16:29Z","title":"Mastering Text-to-Image Diffusion: Recaptioning, Planning, and\n Generating with Multimodal LLMs","summary":" Diffusion models have exhibit exceptional performance in text-to-image\ngeneration and editing. However, existing methods often face challenges when\nhandling complex text prompts that involve multiple objects with multiple\nattributes and relationships. In this paper, we propose a brand new\ntraining-free text-to-image generation/editing framework, namely Recaption,\nPlan and Generate (RPG), harnessing the powerful chain-of-thought reasoning\nability of multimodal LLMs to enhance the compositionality of text-to-image\ndiffusion models. Our approach employs the MLLM as a global planner to\ndecompose the process of generating complex images into multiple simpler\ngeneration tasks within subregions. We propose complementary regional diffusion\nto enable region-wise compositional generation. Furthermore, we integrate\ntext-guided image generation and editing within the proposed RPG in a\nclosed-loop fashion, thereby enhancing generalization ability. Extensive\nexperiments demonstrate our RPG outperforms state-of-the-art text-to-image\ndiffusion models, including DALL-E 3 and SDXL, particularly in multi-category\nobject composition and text-image semantic alignment. Notably, our RPG\nframework exhibits wide compatibility with various MLLM architectures (e.g.,\nMiniGPT-4) and diffusion backbones (e.g., ControlNet). 
Our code is available\nat: https://github.com/YangLing0818/RPG-DiffusionMaster\n","authors":["Ling Yang","Zhaochen Yu","Chenlin Meng","Minkai Xu","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11708v1.pdf","comment":"Project: https://github.com/YangLing0818/RPG-DiffusionMaster"},{"id":"http://arxiv.org/abs/2401.11704v1","updated":"2024-01-22T06:05:26Z","published":"2024-01-22T06:05:26Z","title":"EK-Net:Real-time Scene Text Detection with Expand Kernel Distance","summary":" Recently, scene text detection has received significant attention due to its\nwide application. However, accurate detection in complex scenes of multiple\nscales, orientations, and curvature remains a challenge. Numerous detection\nmethods adopt the Vatti clipping (VC) algorithm for multiple-instance training\nto address the issue of arbitrary-shaped text. Yet we identify several bias\nresults from these approaches called the \"shrinked kernel\". Specifically, it\nrefers to a decrease in accuracy resulting from an output that overly favors\nthe text kernel. In this paper, we propose a new approach named Expand Kernel\nNetwork (EK-Net) with expand kernel distance to compensate for the previous\ndeficiency, which includes three-stages regression to complete instance\ndetection. Moreover, EK-Net not only realize the precise positioning of\narbitrary-shaped text, but also achieve a trade-off between performance and\nspeed. Evaluation results demonstrate that EK-Net achieves state-of-the-art or\ncompetitive performance compared to other advanced methods, e.g., F-measure of\n85.72% at 35.42 FPS on ICDAR 2015, F-measure of 85.75% at 40.13 FPS on CTW1500.\n","authors":["Boyuan Zhu","Fagui Liu","Xi Chen","Quan Tang"],"pdf_url":"https://arxiv.org/pdf/2401.11704v1.pdf","comment":"2024 IEEE International Conference on Acoustics, Speech and Signal\n Processing"},{"id":"http://arxiv.org/abs/2304.03047v3","updated":"2024-01-22T04:57:32Z","published":"2023-04-06T13:07:17Z","title":"ETPNav: Evolving Topological Planning for Vision-Language Navigation in\n Continuous Environments","summary":" Vision-language navigation is a task that requires an agent to follow\ninstructions to navigate in environments. It becomes increasingly crucial in\nthe field of embodied AI, with potential applications in autonomous navigation,\nsearch and rescue, and human-robot interaction. In this paper, we propose to\naddress a more practical yet challenging counterpart setting - vision-language\nnavigation in continuous environments (VLN-CE). To develop a robust VLN-CE\nagent, we propose a new navigation framework, ETPNav, which focuses on two\ncritical skills: 1) the capability to abstract environments and generate\nlong-range navigation plans, and 2) the ability of obstacle-avoiding control in\ncontinuous environments. ETPNav performs online topological mapping of\nenvironments by self-organizing predicted waypoints along a traversed path,\nwithout prior environmental experience. It privileges the agent to break down\nthe navigation procedure into high-level planning and low-level control.\nConcurrently, ETPNav utilizes a transformer-based cross-modal planner to\ngenerate navigation plans based on topological maps and instructions. The plan\nis then performed through an obstacle-avoiding controller that leverages a\ntrial-and-error heuristic to prevent navigation from getting stuck in\nobstacles. Experimental results demonstrate the effectiveness of the proposed\nmethod. 
ETPNav yields more than 10% and 20% improvements over prior\nstate-of-the-art on R2R-CE and RxR-CE datasets, respectively. Our code is\navailable at https://github.com/MarSaKi/ETPNav.\n","authors":["Dong An","Hanqing Wang","Wenguan Wang","Zun Wang","Yan Huang","Keji He","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.03047v3.pdf","comment":"Project page: https://github.com/MarSaKi/ETPNav"},{"id":"http://arxiv.org/abs/2401.11687v1","updated":"2024-01-22T04:54:42Z","published":"2024-01-22T04:54:42Z","title":"TIM: An Efficient Temporal Interaction Module for Spiking Transformer","summary":" Spiking Neural Networks (SNNs), as the third generation of neural networks,\nhave gained prominence for their biological plausibility and computational\nefficiency, especially in processing diverse datasets. The integration of\nattention mechanisms, inspired by advancements in neural network architectures,\nhas led to the development of Spiking Transformers. These have shown promise in\nenhancing SNNs' capabilities, particularly in the realms of both static and\nneuromorphic datasets. Despite their progress, a discernible gap exists in\nthese systems, specifically in the Spiking Self Attention (SSA) mechanism's\neffectiveness in leveraging the temporal processing potential of SNNs. To\naddress this, we introduce the Temporal Interaction Module (TIM), a novel,\nconvolution-based enhancement designed to augment the temporal data processing\nabilities within SNN architectures. TIM's integration into existing SNN\nframeworks is seamless and efficient, requiring minimal additional parameters\nwhile significantly boosting their temporal information handling capabilities.\nThrough rigorous experimentation, TIM has demonstrated its effectiveness in\nexploiting temporal information, leading to state-of-the-art performance across\nvarious neuromorphic datasets.\n","authors":["Sicheng Shen","Dongcheng Zhao","Guobin Shen","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.11687v1.pdf","comment":"10pages,6figures"},{"id":"http://arxiv.org/abs/2310.09221v2","updated":"2024-01-22T04:48:57Z","published":"2023-10-13T16:18:48Z","title":"Ultrasound Image Segmentation of Thyroid Nodule via Latent Semantic\n Feature Co-Registration","summary":" Segmentation of nodules in thyroid ultrasound imaging plays a crucial role in\nthe detection and treatment of thyroid cancer. However, owing to the diversity\nof scanner vendors and imaging protocols in different hospitals, the automatic\nsegmentation model, which has already demonstrated expert-level accuracy in the\nfield of medical image segmentation, finds its accuracy reduced as the result\nof its weak generalization performance when being applied in clinically\nrealistic environments. To address this issue, the present paper proposes ASTN,\na framework for thyroid nodule segmentation achieved through a new type\nco-registration network. By extracting latent semantic information from the\natlas and target images and utilizing in-depth features to accomplish the\nco-registration of nodules in thyroid ultrasound images, this framework can\nensure the integrity of anatomical structure and reduce the impact on\nsegmentation as the result of overall differences in image caused by different\ndevices. In addition, this paper also provides an atlas selection algorithm to\nmitigate the difficulty of co-registration. 
As shown by the evaluation results\ncollected from the datasets of different devices, thanks to the method we\nproposed, the model generalization has been greatly improved while maintaining\na high level of segmentation accuracy.\n","authors":["Xuewei Li","Yaqiao Zhu","Jie Gao","Xi Wei","Ruixuan Zhang","Yuan Tian","ZhiQiang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.09221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07278v2","updated":"2024-01-22T04:43:04Z","published":"2024-01-14T12:22:34Z","title":"Semi-supervised Semantic Segmentation using Redesigned Self-Training for\n White Blood Cell","summary":" Artificial Intelligence (AI) in healthcare, especially in white blood cell\ncancer diagnosis, is hindered by two primary challenges: the lack of\nlarge-scale labeled datasets for white blood cell (WBC) segmentation and\noutdated segmentation methods. To address the first challenge, a\nsemi-supervised learning framework should be brought to efficiently annotate\nthe large dataset. In this work, we address this issue by proposing a novel\nself-training pipeline with the incorporation of FixMatch. We discover that by\nincorporating FixMatch in the self-training pipeline, the performance improves\nin the majority of cases. Our performance achieved the best performance with\nthe self-training scheme with consistency on DeepLab-V3 architecture and\nResNet-50, reaching 90.69%, 87.37%, and 76.49% on Zheng 1, Zheng 2, and LISC\ndatasets, respectively.\n","authors":["Vinh Quoc Luu","Duy Khanh Le","Huy Thanh Nguyen","Minh Thanh Nguyen","Thinh Tien Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2401.07278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11674v1","updated":"2024-01-22T03:24:45Z","published":"2024-01-22T03:24:45Z","title":"Memory-Efficient Prompt Tuning for Incremental Histopathology\n Classification","summary":" Recent studies have made remarkable progress in histopathology\nclassification. Based on current successes, contemporary works proposed to\nfurther upgrade the model towards a more generalizable and robust direction\nthrough incrementally learning from the sequentially delivered domains. Unlike\nprevious parameter isolation based approaches that usually demand massive\ncomputation resources during model updating, we present a memory-efficient\nprompt tuning framework to cultivate model generalization potential in\neconomical memory cost. For each incoming domain, we reuse the existing\nparameters of the initial classification model and attach lightweight trainable\nprompts into it for customized tuning. Considering the domain heterogeneity, we\nperform decoupled prompt tuning, where we adopt a domain-specific prompt for\neach domain to independently investigate its distinctive characteristics, and\none domain-invariant prompt shared across all domains to continually explore\nthe common content embedding throughout time. All domain-specific prompts will\nbe appended to the prompt bank and isolated from further changes to prevent\nforgetting the distinctive features of early-seen domains. While the\ndomain-invariant prompt will be passed on and iteratively evolve by\nstyle-augmented prompt refining to improve model generalization capability over\ntime. In specific, we construct a graph with existing prompts and build a\nstyle-augmented graph attention network to guide the domain-invariant prompt\nexploring the overlapped latent embedding among all delivered domains for more\ndomain generic representations. 
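The FixMatch rule incorporated into the self-training pipeline above reduces to a few lines: a confident prediction on the weakly augmented view becomes the pseudo-label for the strongly augmented view, and unconfident samples are masked out. The threshold value and random logits below are placeholders, not the paper's configuration.

import torch
import torch.nn.functional as F

def fixmatch_unlabeled_loss(logits_weak, logits_strong, threshold=0.95):
    """Cross-entropy on strong-view predictions against confident weak-view pseudo-labels."""
    probs = F.softmax(logits_weak.detach(), dim=1)        # no gradient through the pseudo-label
    conf, pseudo = probs.max(dim=1)
    mask = (conf >= threshold).float()                    # keep only confident samples
    ce = F.cross_entropy(logits_strong, pseudo, reduction="none")
    return (mask * ce).sum() / mask.sum().clamp(min=1.0)

B, K = 16, 3
logits_weak = torch.randn(B, K) * 5                       # stand-in outputs on weak augmentations
logits_strong = torch.randn(B, K, requires_grad=True)     # stand-in outputs on strong augmentations
loss = fixmatch_unlabeled_loss(logits_weak, logits_strong)
loss.backward()
print(float(loss))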
We have extensively evaluated our framework\nwith two histopathology tasks, i.e., breast cancer metastasis classification\nand epithelium-stroma tissue classification, where our approach yielded\nsuperior performance and memory efficiency over the competing methods.\n","authors":["Yu Zhu","Kang Li","Lequan Yu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2401.11674v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11673v1","updated":"2024-01-22T03:22:49Z","published":"2024-01-22T03:22:49Z","title":"MVSFormer++: Revealing the Devil in Transformer's Details for Multi-View\n Stereo","summary":" Recent advancements in learning-based Multi-View Stereo (MVS) methods have\nprominently featured transformer-based models with attention mechanisms.\nHowever, existing approaches have not thoroughly investigated the profound\ninfluence of transformers on different MVS modules, resulting in limited depth\nestimation capabilities. In this paper, we introduce MVSFormer++, a method that\nprudently maximizes the inherent characteristics of attention to enhance\nvarious components of the MVS pipeline. Formally, our approach involves\ninfusing cross-view information into the pre-trained DINOv2 model to facilitate\nMVS learning. Furthermore, we employ different attention mechanisms for the\nfeature encoder and cost volume regularization, focusing on feature and spatial\naggregations respectively. Additionally, we uncover that some design details\nwould substantially impact the performance of transformer modules in MVS,\nincluding normalized 3D positional encoding, adaptive attention scaling, and\nthe position of layer normalization. Comprehensive experiments on DTU,\nTanks-and-Temples, BlendedMVS, and ETH3D validate the effectiveness of the\nproposed method. Notably, MVSFormer++ achieves state-of-the-art performance on\nthe challenging DTU and Tanks-and-Temples benchmarks.\n","authors":["Chenjie Cao","Xinlin Ren","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11673v1.pdf","comment":"Accepted to ICLR2024"},{"id":"http://arxiv.org/abs/2310.01852v7","updated":"2024-01-22T03:11:15Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, naming as\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. 
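The binding recipe described above, a frozen language tower with new modality encoders trained against it contrastively, can be sketched with a symmetric InfoNCE loss. The tiny linear encoders, feature sizes, and random "paired" inputs below are stand-ins for the pretrained text encoder and the depth/infrared/audio towers.

import torch
import torch.nn as nn
import torch.nn.functional as F

def info_nce(a, b, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired embeddings a[i] <-> b[i]."""
    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
    logits = a @ b.t() / temperature
    targets = torch.arange(a.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

dim, B = 128, 16
language_encoder = nn.Linear(300, dim)                 # stand-in for the pretrained text tower
for p in language_encoder.parameters():
    p.requires_grad_(False)                            # the language space stays frozen

depth_encoder = nn.Sequential(nn.Linear(64, 256), nn.ReLU(), nn.Linear(256, dim))
optimizer = torch.optim.AdamW(depth_encoder.parameters(), lr=1e-4)

text_feats = language_encoder(torch.randn(B, 300))     # embeddings of paired captions
depth_feats = depth_encoder(torch.randn(B, 64))        # embeddings of the new modality

loss = info_nce(depth_feats, text_feats)
loss.backward()
optimizer.step()                                       # only the new modality tower is updated
print(float(loss))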
LanguageBind has achieved superior performance on a wide range of\n15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple\nexperiments have provided evidence for the effectiveness of LanguageBind in\nachieving indirect alignment and complementarity among diverse modalities. Code\naddress: https://github.com/PKU-YuanGroup/LanguageBind\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","HongFa Wang","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Wancai Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v7.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11671v1","updated":"2024-01-22T03:09:00Z","published":"2024-01-22T03:09:00Z","title":"RTA-Former: Reverse Transformer Attention for Polyp Segmentation","summary":" Polyp segmentation is a key aspect of colorectal cancer prevention, enabling\nearly detection and guiding subsequent treatments. Intelligent diagnostic\ntools, including deep learning solutions, are widely explored to streamline and\npotentially automate this process. However, even with many powerful network\narchitectures, there still comes the problem of producing accurate edge\nsegmentation. In this paper, we introduce a novel network, namely RTA-Former,\nthat employs a transformer model as the encoder backbone and innovatively\nadapts Reverse Attention (RA) with a transformer stage in the decoder for\nenhanced edge segmentation. The results of the experiments illustrate that\nRTA-Former achieves state-of-the-art (SOTA) performance in five polyp\nsegmentation datasets. The strong capability of RTA-Former holds promise in\nimproving the accuracy of Transformer-based polyp segmentation, potentially\nleading to better clinical decisions and patient outcomes. Our code will be\npublicly available on GitHub.\n","authors":["Zhikai Li","Murong Yi","Ali Uneri","Sihan Niu","Craig Jones"],"pdf_url":"https://arxiv.org/pdf/2401.11671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08898v3","updated":"2024-01-22T03:01:28Z","published":"2023-01-21T05:34:29Z","title":"Recurrent Generic Contour-based Instance Segmentation with Progressive\n Learning","summary":" Contour-based instance segmentation has been actively studied, thanks to its\nflexibility and elegance in processing visual objects within complex\nbackgrounds. In this work, we propose a novel deep network architecture, i.e.,\nPolySnake, for generic contour-based instance segmentation. Motivated by the\nclassic Snake algorithm, the proposed PolySnake achieves superior and robust\nsegmentation performance with an iterative and progressive contour refinement\nstrategy. Technically, PolySnake introduces a recurrent update operator to\nestimate the object contour iteratively. It maintains a single estimate of the\ncontour that is progressively deformed toward the object boundary. At each\niteration, PolySnake builds a semantic-rich representation for the current\ncontour and feeds it to the recurrent operator for further contour adjustment.\nThrough the iterative refinements, the contour progressively converges to a\nstable status that tightly encloses the object instance. Beyond the scope of\ngeneral instance segmentation, extensive experiments are conducted to validate\nthe effectiveness and generalizability of our PolySnake in two additional\nspecific task scenarios, including scene text detection and lane detection. 
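The reverse attention operation that RTA-Former adapts in its decoder can be sketched as follows: invert a coarse prediction so that attention shifts to the regions the previous stage called background, which is where missed foreground and fuzzy edges tend to hide, then predict a residual correction. The channel sizes and the two-convolution refinement head are assumptions.

import torch
import torch.nn as nn

class ReverseAttention(nn.Module):
    """Refine a coarse mask by attending to what the previous stage missed."""
    def __init__(self, channels):
        super().__init__()
        self.refine = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(),
            nn.Conv2d(channels, 1, 1))

    def forward(self, feats, coarse_logits):
        # 1 - sigmoid(coarse): high where the current prediction says "background".
        reverse = 1.0 - torch.sigmoid(coarse_logits)
        residual = self.refine(feats * reverse)
        return coarse_logits + residual          # residual correction of the coarse mask

feats = torch.randn(2, 32, 44, 44)
coarse = torch.randn(2, 1, 44, 44)
refined = ReverseAttention(32)(feats, coarse)
print(refined.shape)                             # torch.Size([2, 1, 44, 44])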
The\nresults demonstrate that the proposed PolySnake outperforms the existing\nadvanced methods on several multiple prevalent benchmarks across the three\ntasks. The codes and pre-trained models are available at\nhttps://github.com/fh2019ustc/PolySnake\n","authors":["Hao Feng","Keyi Zhou","Wengang Zhou","Yufei Yin","Jiajun Deng","Qi Sun","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2301.08898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07444v3","updated":"2024-01-22T02:56:05Z","published":"2023-04-15T01:33:14Z","title":"The Art of Camouflage: Few-shot Learning for Animal Detection and\n Segmentation","summary":" Camouflaged object detection and segmentation is a new and challenging\nresearch topic in computer vision. There is a serious issue of lacking data of\ncamouflaged objects such as camouflaged animals in natural scenes. In this\npaper, we address the problem of few-shot learning for camouflaged object\ndetection and segmentation. To this end, we first collect a new dataset,\nCAMO-FS, for the benchmark. We then propose a novel method to efficiently\ndetect and segment the camouflaged objects in the images. In particular, we\nintroduce the instance triplet loss and the instance memory storage. The\nextensive experiments demonstrated that our proposed method achieves\nstate-of-the-art performance on the newly collected dataset.\n","authors":["Thanh-Danh Nguyen","Anh-Khoa Nguyen Vu","Nhat-Duy Nguyen","Vinh-Tiep Nguyen","Thanh Duc Ngo","Thanh-Toan Do","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2304.07444v3.pdf","comment":"Under-review Journal"},{"id":"http://arxiv.org/abs/2305.16789v2","updated":"2024-01-22T02:47:50Z","published":"2023-05-26T09:59:48Z","title":"Modulate Your Spectrum in Self-Supervised Learning","summary":" Whitening loss offers a theoretical guarantee against feature collapse in\nself-supervised learning (SSL) with joint embedding architectures. Typically,\nit involves a hard whitening approach, transforming the embedding and applying\nloss to the whitened output. In this work, we introduce Spectral Transformation\n(ST), a framework to modulate the spectrum of embedding and to seek for\nfunctions beyond whitening that can avoid dimensional collapse. We show that\nwhitening is a special instance of ST by definition, and our empirical\ninvestigations unveil other ST instances capable of preventing collapse.\nAdditionally, we propose a novel ST instance named IterNorm with trace loss\n(INTL). Theoretical analysis confirms INTL's efficacy in preventing collapse\nand modulating the spectrum of embedding toward equal-eigenvalues during\noptimization. Our experiments on ImageNet classification and COCO object\ndetection demonstrate INTL's potential in learning superior representations.\nThe code is available at https://github.com/winci-ai/INTL.\n","authors":["Xi Weng","Yunhao Ni","Tengwei Song","Jie Luo","Rao Muhammad Anwer","Salman Khan","Fahad Shahbaz Khan","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2305.16789v2.pdf","comment":"Accepted at ICLR 2024. The code is available at\n https://github.com/winci-ai/intl"},{"id":"http://arxiv.org/abs/2401.10150v3","updated":"2024-01-22T02:40:52Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. 
However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during the inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process. Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos.\n","authors":["Changgu Chen","Junwei Shu","Lianggangxu Chen","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.11654v1","updated":"2024-01-22T02:21:26Z","published":"2024-01-22T02:21:26Z","title":"ActionHub: A Large-scale Action Video Description Dataset for Zero-shot\n Action Recognition","summary":" Zero-shot action recognition (ZSAR) aims to learn an alignment model between\nvideos and class descriptions of seen actions that is transferable to unseen\nactions. The text queries (class descriptions) used in existing ZSAR works,\nhowever, are often short action names that fail to capture the rich semantics\nin the videos, leading to misalignment. With the intuition that video content\ndescriptions (e.g., video captions) can provide rich contextual information of\nvisual concepts in videos, we propose to utilize human annotated video\ndescriptions to enrich the semantics of the class descriptions of each action.\nHowever, all existing action video description datasets are limited in terms of\nthe number of actions, the semantics of video descriptions, etc. To this end,\nwe collect a large-scale action video descriptions dataset named ActionHub,\nwhich covers a total of 1,211 common actions and provides 3.6 million action\nvideo descriptions. With the proposed ActionHub dataset, we further propose a\nnovel Cross-modality and Cross-action Modeling (CoCo) framework for ZSAR, which\nconsists of a Dual Cross-modality Alignment module and a Cross-action\nInvariance Mining module. Specifically, the Dual Cross-modality Alignment\nmodule utilizes both action labels and video descriptions from ActionHub to\nobtain rich class semantic features for feature alignment. The Cross-action\nInvariance Mining module exploits a cycle-reconstruction process between the\nclass semantic feature spaces of seen actions and unseen actions, aiming to\nguide the model to learn cross-action invariant representations. Extensive\nexperimental results demonstrate that our CoCo framework significantly\noutperforms the state-of-the-art on three popular ZSAR benchmarks (i.e.,\nKinetics-ZSAR, UCF101 and HMDB51) under two different learning protocols in\nZSAR. 
We will release our code, models, and the proposed ActionHub dataset.\n","authors":["Jiaming Zhou","Junwei Liang","Kun-Yu Lin","Jinrui Yang","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.11654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11652v1","updated":"2024-01-22T02:17:36Z","published":"2024-01-22T02:17:36Z","title":"OnDev-LCT: On-Device Lightweight Convolutional Transformers towards\n federated learning","summary":" Federated learning (FL) has emerged as a promising approach to\ncollaboratively train machine learning models across multiple edge devices\nwhile preserving privacy. The success of FL hinges on the efficiency of\nparticipating models and their ability to handle the unique challenges of\ndistributed learning. While several variants of Vision Transformer (ViT) have\nshown great potential as alternatives to modern convolutional neural networks\n(CNNs) for centralized training, the unprecedented size and higher\ncomputational demands hinder their deployment on resource-constrained edge\ndevices, challenging their widespread application in FL. Since client devices\nin FL typically have limited computing resources and communication bandwidth,\nmodels intended for such devices must strike a balance between model size,\ncomputational efficiency, and the ability to adapt to the diverse and non-IID\ndata distributions encountered in FL. To address these challenges, we propose\nOnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks\nwith limited training data and resources. Our models incorporate image-specific\ninductive biases through the LCT tokenizer by leveraging efficient depthwise\nseparable convolutions in residual linear bottleneck blocks to extract local\nfeatures, while the multi-head self-attention (MHSA) mechanism in the LCT\nencoder implicitly facilitates capturing global representations of images.\nExtensive experiments on benchmark image datasets indicate that our models\noutperform existing lightweight vision models while having fewer parameters and\nlower computational demands, making them suitable for FL scenarios with data\nheterogeneity and communication bottlenecks.\n","authors":["Chu Myaet Thwal","Minh N. H. Nguyen","Ye Lin Tun","Seong Tae Kim","My T. Thai","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11652v1.pdf","comment":"Published in Neural Networks"},{"id":"http://arxiv.org/abs/2401.11650v1","updated":"2024-01-22T02:05:33Z","published":"2024-01-22T02:05:33Z","title":"PointGL: A Simple Global-Local Framework for Efficient Point Cloud\n Analysis","summary":" Efficient analysis of point clouds holds paramount significance in real-world\n3D applications. Currently, prevailing point-based models adhere to the\nPointNet++ methodology, which involves embedding and abstracting point features\nwithin a sequence of spatially overlapping local point sets, resulting in\nnoticeable computational redundancy. Drawing inspiration from the streamlined\nparadigm of pixel embedding followed by regional pooling in Convolutional\nNeural Networks (CNNs), we introduce a novel, uncomplicated yet potent\narchitecture known as PointGL, crafted to facilitate efficient point cloud\nanalysis. PointGL employs a hierarchical process of feature acquisition through\ntwo recursive steps. First, the Global Point Embedding leverages\nstraightforward residual Multilayer Perceptrons (MLPs) to effectuate feature\nembedding for each individual point. 
Second, the novel Local Graph Pooling\ntechnique characterizes point-to-point relationships and abstracts regional\nrepresentations through succinct local graphs. The harmonious fusion of\none-time point embedding and parameter-free graph pooling contributes to\nPointGL's defining attributes of minimized model complexity and heightened\nefficiency. Our PointGL attains state-of-the-art accuracy on the ScanObjectNN\ndataset while exhibiting a runtime that is more than 5 times faster and\nutilizing only approximately 4% of the FLOPs and 30% of the parameters compared\nto the recent PointMLP model. The code for PointGL is available at\nhttps://github.com/Roywangj/PointGL.\n","authors":["Jianan Li","Jie Wang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11649v1","updated":"2024-01-22T02:03:31Z","published":"2024-01-22T02:03:31Z","title":"M2-CLIP: A Multimodal, Multi-task Adapting Framework for Video Action\n Recognition","summary":" Recently, the rise of large-scale vision-language pretrained models like\nCLIP, coupled with the technology of Parameter-Efficient FineTuning (PEFT), has\ncaptured substantial attraction in video action recognition. Nevertheless,\nprevailing approaches tend to prioritize strong supervised performance at the\nexpense of compromising the models' generalization capabilities during\ntransfer. In this paper, we introduce a novel Multimodal, Multi-task CLIP\nadapting framework named \\name to address these challenges, preserving both\nhigh supervised performance and robust transferability. Firstly, to enhance the\nindividual modality architectures, we introduce multimodal adapters to both the\nvisual and text branches. Specifically, we design a novel visual TED-Adapter,\nthat performs global Temporal Enhancement and local temporal Difference\nmodeling to improve the temporal representation capabilities of the visual\nencoder. Moreover, we adopt text encoder adapters to strengthen the learning of\nsemantic label information. Secondly, we design a multi-task decoder with a\nrich set of supervisory signals to adeptly satisfy the need for strong\nsupervised performance and generalization within a multimodal framework.\nExperimental results validate the efficacy of our approach, demonstrating\nexceptional performance in supervised learning while maintaining strong\ngeneralization in zero-shot scenarios.\n","authors":["Mengmeng Wang","Jiazheng Xing","Boyuan Jiang","Jun Chen","Jianbiao Mei","Xingxing Zuo","Guang Dai","Jingdong Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05482v2","updated":"2024-01-22T01:48:29Z","published":"2023-04-11T20:28:33Z","title":"Computational Pathology: A Survey Review and The Way Forward","summary":" Computational Pathology CPath is an interdisciplinary science that augments\ndevelopments of computational approaches to analyze and model medical\nhistopathology images. The main objective for CPath is to develop\ninfrastructure and workflows of digital diagnostics as an assistive CAD system\nfor clinical pathology, facilitating transformational changes in the diagnosis\nand treatment of cancer that are mainly address by CPath tools. With\nevergrowing developments in deep learning and computer vision algorithms, and\nthe ease of the data flow from digital pathology, currently CPath is witnessing\na paradigm shift. 
Despite the sheer volume of engineering and scientific works\nbeing introduced for cancer image analysis, there is still a considerable gap\nof adopting and integrating these algorithms in clinical practice. This raises\na significant question regarding the direction and trends that are undertaken\nin CPath. In this article we provide a comprehensive review of more than 800\npapers to address the challenges faced in problem design all-the-way to the\napplication and implementation viewpoints. We have catalogued each paper into a\nmodel-card by examining the key works and challenges faced to layout the\ncurrent landscape in CPath. We hope this helps the community to locate relevant\nworks and facilitate understanding of the field's future directions. In a\nnutshell, we oversee the CPath developments in cycle of stages which are\nrequired to be cohesively linked together to address the challenges associated\nwith such multidisciplinary science. We overview this cycle from different\nperspectives of data-centric, model-centric, and application-centric problems.\nWe finally sketch remaining challenges and provide directions for future\ntechnical developments and clinical integration of CPath\n(https://github.com/AtlasAnalyticsLab/CPath_Survey).\n","authors":["Mahdi S. Hosseini","Babak Ehteshami Bejnordi","Vincent Quoc-Huy Trinh","Danial Hasan","Xingwen Li","Taehyo Kim","Haochen Zhang","Theodore Wu","Kajanan Chinniah","Sina Maghsoudlou","Ryan Zhang","Stephen Yang","Jiadai Zhu","Lyndon Chan","Samir Khaki","Andrei Buin","Fatemeh Chaji","Ala Salehi","Bich Ngoc Nguyen","Dimitris Samaras","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2304.05482v2.pdf","comment":"Accepted in Elsevier Journal of Pathology Informatics (JPI) 2024"},{"id":"http://arxiv.org/abs/2401.11644v1","updated":"2024-01-22T01:34:03Z","published":"2024-01-22T01:34:03Z","title":"Friends Across Time: Multi-Scale Action Segmentation Transformer for\n Surgical Phase Recognition","summary":" Automatic surgical phase recognition is a core technology for modern\noperating rooms and online surgical video assessment platforms. Current\nstate-of-the-art methods use both spatial and temporal information to tackle\nthe surgical phase recognition task. Building on this idea, we propose the\nMulti-Scale Action Segmentation Transformer (MS-AST) for offline surgical phase\nrecognition and the Multi-Scale Action Segmentation Causal Transformer\n(MS-ASCT) for online surgical phase recognition. We use ResNet50 or\nEfficientNetV2-M for spatial feature extraction. Our MS-AST and MS-ASCT can\nmodel temporal information at different scales with multi-scale temporal\nself-attention and multi-scale temporal cross-attention, which enhances the\ncapture of temporal relationships between frames and segments. We demonstrate\nthat our method can achieve 95.26% and 96.15% accuracy on the Cholec80 dataset\nfor online and offline surgical phase recognition, respectively, which achieves\nnew state-of-the-art results. 
Our method can also achieve state-of-the-art\nresults on non-medical datasets in the video action segmentation domain.\n","authors":["Bokai Zhang","Jiayuan Meng","Bin Cheng","Dean Biskup","Svetlana Petculescu","Angela Chapman"],"pdf_url":"https://arxiv.org/pdf/2401.11644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17778v3","updated":"2024-01-22T00:54:30Z","published":"2023-06-30T16:31:14Z","title":"Look, Remember and Reason: Grounded reasoning in videos with language\n models","summary":" Multi-modal language models (LM) have recently shown promising performance in\nhigh-level reasoning tasks on videos. However, existing methods still fall\nshort in tasks like causal or compositional spatiotemporal reasoning over\nactions, in which model predictions need to be grounded in fine-grained\nlow-level details, such as object motions and object interactions. In this\nwork, we propose training an LM end-to-end on low-level surrogate tasks,\nincluding object detection, re-identification, and tracking, to endow the model\nwith the required low-level visual capabilities. We show that a two-stream\nvideo encoder with spatiotemporal attention is effective at capturing the\nrequired static and motion-based cues in the video. By leveraging the LM's\nability to perform the low-level surrogate tasks, we can cast reasoning in\nvideos as the three-step process of Look, Remember, Reason wherein visual\ninformation is extracted using low-level visual skills step-by-step and then\nintegrated to arrive at a final answer. We demonstrate the effectiveness of our\nframework on diverse visual reasoning tasks from the ACRE, CATER,\nSomething-Else and STAR datasets. Our approach is trainable end-to-end and\nsurpasses state-of-the-art task-specific methods across these tasks by a large\nmargin.\n","authors":["Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Reza Pourreza","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2306.17778v3.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2309.01409v5","updated":"2024-01-22T00:22:14Z","published":"2023-09-04T07:40:30Z","title":"Implicit Neural Image Stitching","summary":" Existing frameworks for image stitching often provide visually reasonable\nstitchings. However, they suffer from blurry artifacts and disparities in\nillumination, depth level, etc. Although the recent learning-based stitchings\nrelax such disparities, the required methods impose sacrifice of image\nqualities failing to capture high-frequency details for stitched images. To\naddress the problem, we propose a novel approach, implicit Neural Image\nStitching (NIS) that extends arbitrary-scale super-resolution. Our method\nestimates Fourier coefficients of images for quality-enhancing warps. Then, the\nsuggested model blends color mismatches and misalignment in the latent space\nand decodes the features into RGB values of stitched images. Our experiments\nshow that our approach achieves improvement in resolving the low-definition\nimaging of the previous deep image stitching with favorable accelerated\nimage-enhancing methods. 
Our source code is available at\nhttps://github.com/minshu-kim/NIS.\n","authors":["Minsu Kim","Jaewon Lee","Byeonghun Lee","Sunghoon Im","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01409v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11633v1","updated":"2024-01-22T00:00:30Z","published":"2024-01-22T00:00:30Z","title":"Zoom-shot: Fast and Efficient Unsupervised Zero-Shot Transfer of CLIP to\n Vision Encoders with Multimodal Loss","summary":" The fusion of vision and language has brought about a transformative shift in\ncomputer vision through the emergence of Vision-Language Models (VLMs).\nHowever, the resource-intensive nature of existing VLMs poses a significant\nchallenge. We need an accessible method for developing the next generation of\nVLMs. To address this issue, we propose Zoom-shot, a novel method for\ntransferring the zero-shot capabilities of CLIP to any pre-trained vision\nencoder. We do this by exploiting the multimodal information (i.e. text and\nimage) present in the CLIP latent space through the use of specifically\ndesigned multimodal loss functions. These loss functions are (1)\ncycle-consistency loss and (2) our novel prompt-guided knowledge distillation\nloss (PG-KD). PG-KD combines the concept of knowledge distillation with CLIP's\nzero-shot classification, to capture the interactions between text and image\nfeatures. With our multimodal losses, we train a $\\textbf{linear mapping}$\nbetween the CLIP latent space and the latent space of a pre-trained vision\nencoder, for only a $\\textbf{single epoch}$. Furthermore, Zoom-shot is entirely\nunsupervised and is trained using $\\textbf{unpaired}$ data. We test the\nzero-shot capabilities of a range of vision encoders augmented as new VLMs, on\ncoarse and fine-grained classification datasets, outperforming the previous\nstate-of-the-art in this problem domain. In our ablations, we find Zoom-shot\nallows for a trade-off between data and compute during training; and our\nstate-of-the-art results can be obtained by reducing training from 20% to 1% of\nthe ImageNet training data with 20 epochs. All code and models are available on\nGitHub.\n","authors":["Jordan Shipard","Arnold Wiliem","Kien Nguyen Thanh","Wei Xiang","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2401.11633v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2311.02749v2","updated":"2024-01-22T21:30:26Z","published":"2023-11-05T19:59:36Z","title":"Fast Point Cloud to Mesh Reconstruction for Deformable Object Tracking","summary":" The world around us is full of soft objects we perceive and deform with\ndexterous hand movements. For a robotic hand to control soft objects, it has to\nacquire online state feedback of the deforming object. While RGB-D cameras can\ncollect occluded point clouds at a rate of 30Hz, this does not represent a\ncontinuously trackable object surface. Hence, in this work, we developed a\nmethod that takes as input a template mesh which is the mesh of an object in\nits non-deformed state and a deformed point cloud of the same object, and then\nshapes the template mesh such that it matches the deformed point cloud. The\nreconstruction of meshes from point clouds has long been studied in the field\nof Computer graphics under 3D reconstruction and 4D reconstruction, however,\nboth lack the speed and generalizability needed for robotics applications. 
Our\nmodel is designed using a point cloud auto-encoder and a Real-NVP architecture.\nOur trained model can perform mesh reconstruction and tracking at a rate of\n58Hz on a template mesh of 3000 vertices and a deformed point cloud of 5000\npoints and is generalizable to the deformations of six different object\ncategories which are assumed to be made of soft material in our experiments\n(scissors, hammer, foam brick, cleanser bottle, orange, and dice). The object\nmeshes are taken from the YCB benchmark dataset. An instance of a downstream\napplication can be the control algorithm for a robotic hand that requires\nonline feedback from the state of the manipulated object which would allow\nonline grasp adaptation in a closed-loop manner. Furthermore, the tracking\ncapacity of our method can help in the system identification of deforming\nobjects in a marker-free approach. In future work, we will extend our trained\nmodel to generalize beyond six object categories and additionally to real-world\ndeforming point clouds.\n","authors":["Elham Amin Mansour","Hehui Zheng","Robert K. Katzschmann"],"pdf_url":"https://arxiv.org/pdf/2311.02749v2.pdf","comment":"8 pages with appendix,16 figures"},{"id":"http://arxiv.org/abs/2305.03053v2","updated":"2024-01-22T20:56:16Z","published":"2023-05-04T17:59:58Z","title":"ZipIt! Merging Models from Different Tasks without Training","summary":" Typical deep visual recognition models are capable of performing the one task\nthey were trained on. In this paper, we tackle the extremely difficult problem\nof combining distinct models with different initializations, each solving a\nseparate task, into one multi-task model without any additional training. Prior\nwork in model merging permutes one model to the space of the other then\naverages them together. While this works for models trained on the same task,\nwe find that this fails to account for the differences in models trained on\ndisjoint tasks. Thus, we introduce \"ZipIt!\", a general method for merging two\narbitrary models of the same architecture that incorporates two simple\nstrategies. First, in order to account for features that aren't shared between\nmodels, we expand the model merging problem to allow for merging features\nwithin each model by defining a general \"zip\" operation. Second, we add support\nfor partially zipping the models up until a specified layer, naturally creating\na multi-head model. We find that these two changes combined account for 20-60%\nimprovement over prior work, making it more feasible to merge models trained on\ndisjoint tasks without retraining.\n","authors":["George Stoica","Daniel Bolya","Jakob Bjorner","Pratik Ramesh","Taylor Hearn","Judy Hoffman"],"pdf_url":"https://arxiv.org/pdf/2305.03053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12350v1","updated":"2024-01-22T20:32:31Z","published":"2024-01-22T20:32:31Z","title":"Scaling Up Quantization-Aware Neural Architecture Search for Efficient\n Deep Learning on the Edge","summary":" Neural Architecture Search (NAS) has become the de-facto approach for\ndesigning accurate and efficient networks for edge devices. Since models are\ntypically quantized for edge deployment, recent work has investigated\nquantization-aware NAS (QA-NAS) to search for highly accurate and efficient\nquantized models. However, existing QA-NAS approaches, particularly few-bit\nmixed-precision (FB-MP) methods, do not scale to larger tasks. Consequently,\nQA-NAS has mostly been limited to low-scale tasks and tiny networks. 
In this\nwork, we present an approach to enable QA-NAS (INT8 and FB-MP) on large-scale\ntasks by leveraging the block-wise formulation introduced by block-wise NAS. We\ndemonstrate strong results for the semantic segmentation task on the Cityscapes\ndataset, finding FB-MP models 33% smaller and INT8 models 17.6% faster than\nDeepLabV3 (INT8) without compromising task performance.\n","authors":["Yao Lu","Hiram Rayo Torres Rodriguez","Sebastian Vogel","Nick van de Waterlaat","Pavol Jancura"],"pdf_url":"https://arxiv.org/pdf/2401.12350v1.pdf","comment":"Accepted at Workshop on Compilers, Deployment, and Tooling for Edge\n AI (CODAI '23 ), September 21, 2023, Hamburg, Germany"},{"id":"http://arxiv.org/abs/2401.12344v1","updated":"2024-01-22T20:17:14Z","published":"2024-01-22T20:17:14Z","title":"OCT-SelfNet: A Self-Supervised Framework with Multi-Modal Datasets for\n Generalized and Robust Retinal Disease Detection","summary":" Despite the revolutionary impact of AI and the development of locally trained\nalgorithms, achieving widespread generalized learning from multi-modal data in\nmedical AI remains a significant challenge. This gap hinders the practical\ndeployment of scalable medical AI solutions. Addressing this challenge, our\nresearch contributes a self-supervised robust machine learning framework,\nOCT-SelfNet, for detecting eye diseases using optical coherence tomography\n(OCT) images. In this work, various data sets from various institutions are\ncombined enabling a more comprehensive range of representation. Our method\naddresses the issue using a two-phase training approach that combines\nself-supervised pretraining and supervised fine-tuning with a mask autoencoder\nbased on the SwinV2 backbone by providing a solution for real-world clinical\ndeployment. Extensive experiments on three datasets with different encoder\nbackbones, low data settings, unseen data settings, and the effect of\naugmentation show that our method outperforms the baseline model, Resnet-50 by\nconsistently attaining AUC-ROC performance surpassing 77% across all tests,\nwhereas the baseline model exceeds 54%. Moreover, in terms of the AUC-PR\nmetric, our proposed method exceeded 42%, showcasing a substantial increase of\nat least 10% in performance compared to the baseline, which exceeded only 33%.\nThis contributes to our understanding of our approach's potential and\nemphasizes its usefulness in clinical settings.\n","authors":["Fatema-E Jannat","Sina Gholami","Minhaj Nur Alam","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2401.12344v1.pdf","comment":"12 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.12340v1","updated":"2024-01-22T20:08:57Z","published":"2024-01-22T20:08:57Z","title":"Contrastive Learning and Cycle Consistency-based Transductive Transfer\n Learning for Target Annotation","summary":" Annotating automatic target recognition (ATR) is a highly challenging task,\nprimarily due to the unavailability of labeled data in the target domain.\nHence, it is essential to construct an optimal target domain classifier by\nutilizing the labeled information of the source domain images. The transductive\ntransfer learning (TTL) method that incorporates a CycleGAN-based unpaired\ndomain translation network has been previously proposed in the literature for\neffective ATR annotation. 
Although this method demonstrates great potential for\nATR, it severely suffers from lower annotation performance, higher Fr\\'echet\nInception Distance (FID) score, and the presence of visual artifacts in the\nsynthetic images. To address these issues, we propose a hybrid contrastive\nlearning base unpaired domain translation (H-CUT) network that achieves a\nsignificantly lower FID score. It incorporates both attention and entropy to\nemphasize the domain-specific region, a noisy feature mixup module to generate\nhigh variational synthetic negative patches, and a modulated noise contrastive\nestimation (MoNCE) loss to reweight all negative patches using optimal\ntransport for better performance. Our proposed contrastive learning and\ncycle-consistency-based TTL (C3TTL) framework consists of two H-CUT networks\nand two classifiers. It simultaneously optimizes cycle-consistency, MoNCE, and\nidentity losses. In C3TTL, two H-CUT networks have been employed through a\nbijection mapping to feed the reconstructed source domain images into a\npretrained classifier to guide the optimal target domain classifier. Extensive\nexperimental analysis conducted on three ATR datasets demonstrates that the\nproposed C3TTL method is effective in annotating civilian and military\nvehicles, as well as ship targets.\n","authors":["Shoaib Meraj Sami","Md Mahedi Hasan","Nasser M. Nasrabadi","Raghuveer Rao"],"pdf_url":"https://arxiv.org/pdf/2401.12340v1.pdf","comment":"This Paper is Accepted in IEEE TRANSACTIONS ON AEROSPACE AND\n ELECTRONIC SYSTEMS. This Arxiv version is an older version than the reviewed\n version"},{"id":"http://arxiv.org/abs/2002.04251v3","updated":"2024-01-22T20:05:23Z","published":"2020-02-11T08:24:19Z","title":"2.75D: Boosting learning by representing 3D Medical imaging to 2D\n features for small data","summary":" In medical-data driven learning, 3D convolutional neural networks (CNNs) have\nstarted to show superior performance to 2D CNNs in numerous deep learning\ntasks, proving the added value of 3D spatial information in feature\nrepresentation. However, the difficulty in collecting more training samples to\nconverge, more computational resources and longer execution time make this\napproach less applied. Also, applying transfer learning on 3D CNN is\nchallenging due to a lack of publicly available pre-trained 3D models. To\ntackle these issues, we proposed a novel 2D strategical representation of\nvolumetric data, namely 2.75D. In this work, the spatial information of 3D\nimages is captured in a single 2D view by a spiral-spinning technique. As a\nresult, 2D CNN networks can also be used to learn volumetric information.\nBesides, we can fully leverage pre-trained 2D CNNs for downstream vision\nproblems. We also explore a multi-view 2.75D strategy, 2.75D 3 channels\n(2.75Dx3), to boost the advantage of 2.75D. We evaluated the proposed methods\non three public datasets with different modalities or organs (Lung CT, Breast\nMRI, and Prostate MRI), against their 2D, 2.5D, and 3D counterparts in\nclassification tasks. Results show that the proposed methods significantly\noutperform other counterparts when all methods were trained from scratch on the\nlung dataset. Such performance gain is more pronounced with transfer learning\nor in the case of limited training data. Our methods also achieved comparable\nperformance on other datasets. 
In addition, our methods achieved a substantial\nreduction in time consumption of training and inference compared with the 2.5D\nor 3D method.\n","authors":["Xin Wang","Ruisheng Su","Weiyi Xie","Wenjin Wang","Yi Xu","Ritse Mann","Jungong Han","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2002.04251v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17189v3","updated":"2024-01-22T19:29:16Z","published":"2023-09-29T12:38:00Z","title":"RTFS-Net: Recurrent time-frequency modelling for efficient audio-visual\n speech separation","summary":" Audio-visual speech separation methods aim to integrate different modalities\nto generate high-quality separated speech, thereby enhancing the performance of\ndownstream tasks such as speech recognition. Most existing state-of-the-art\n(SOTA) models operate in the time domain. However, their overly simplistic\napproach to modeling acoustic features often necessitates larger and more\ncomputationally intensive models in order to achieve SOTA performance. In this\npaper, we present a novel time-frequency domain audio-visual speech separation\nmethod: Recurrent Time-Frequency Separation Network (RTFS-Net), which applies\nits algorithms on the complex time-frequency bins yielded by the Short-Time\nFourier Transform. We model and capture the time and frequency dimensions of\nthe audio independently using a multi-layered RNN along each dimension.\nFurthermore, we introduce a unique attention-based fusion technique for the\nefficient integration of audio and visual information, and a new mask\nseparation approach that takes advantage of the intrinsic spectral nature of\nthe acoustic features for a clearer separation. RTFS-Net outperforms the\nprevious SOTA method using only 10% of the parameters and 18% of the MACs. This\nis the first time-frequency domain audio-visual speech separation method to\noutperform all contemporary time-domain counterparts.\n","authors":["Samuel Pegg","Kai Li","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2309.17189v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2310.05207v2","updated":"2024-01-22T19:06:15Z","published":"2023-10-08T15:49:26Z","title":"Boosting Facial Action Unit Detection Through Jointly Learning Facial\n Landmark Detection and Domain Separation and Reconstruction","summary":" Recently how to introduce large amounts of unlabeled facial images in the\nwild into supervised Facial Action Unit (AU) detection frameworks has become a\nchallenging problem. In this paper, we propose a new AU detection framework\nwhere multi-task learning is introduced to jointly learn AU domain separation\nand reconstruction and facial landmark detection by sharing the parameters of\nhomostructural facial extraction modules. In addition, we propose a new feature\nalignment scheme based on contrastive learning by simple projectors and an\nimproved contrastive loss, which adds four additional intermediate supervisors\nto promote the feature reconstruction process. 
Experimental results on two\nbenchmarks demonstrate our superiority against the state-of-the-art methods for\nAU detection in the wild.\n","authors":["Ziqiao Shang","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2310.05207v2.pdf","comment":"5 pages, 1 figure, published to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12275v1","updated":"2024-01-22T18:58:22Z","published":"2024-01-22T18:58:22Z","title":"Multi-Agent Dynamic Relational Reasoning for Social Robot Navigation","summary":" Social robot navigation can be helpful in various contexts of daily life but\nrequires safe human-robot interactions and efficient trajectory planning. While\nmodeling pairwise relations has been widely studied in multi-agent interacting\nsystems, the ability to capture larger-scale group-wise activities is limited.\nIn this paper, we propose a systematic relational reasoning approach with\nexplicit inference of the underlying dynamically evolving relational\nstructures, and we demonstrate its effectiveness for multi-agent trajectory\nprediction and social robot navigation. In addition to the edges between pairs\nof nodes (i.e., agents), we propose to infer hyperedges that adaptively connect\nmultiple nodes to enable group-wise reasoning in an unsupervised manner. Our\napproach infers dynamically evolving relation graphs and hypergraphs to capture\nthe evolution of relations, which the trajectory predictor employs to generate\nfuture states. Meanwhile, we propose to regularize the sharpness and sparsity\nof the learned relations and the smoothness of the relation evolution, which\nproves to enhance training stability and model performance. The proposed\napproach is validated on synthetic crowd simulations and real-world benchmark\ndatasets. Experiments demonstrate that the approach infers reasonable relations\nand achieves state-of-the-art prediction performance. In addition, we present a\ndeep reinforcement learning (DRL) framework for social robot navigation, which\nincorporates relational reasoning and trajectory prediction systematically. In\na group-based crowd simulation, our method outperforms the strongest baseline\nby a significant margin in terms of safety, efficiency, and social compliance\nin dense, interactive scenarios.\n","authors":["Jiachen Li","Chuanbo Hua","Hengbo Ma","Jinkyoo Park","Victoria Dax","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2401.12275v1.pdf","comment":"19 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.07063v2","updated":"2024-01-22T15:30:26Z","published":"2023-12-12T08:32:55Z","title":"Template Free Reconstruction of Human-object Interaction with Procedural\n Interaction Generation","summary":" Reconstructing human-object interaction in 3D from a single RGB image is a\nchallenging task and existing data driven methods do not generalize beyond the\nobjects present in the carefully curated 3D interaction datasets. Capturing\nlarge-scale real data to learn strong interaction and 3D shape priors is very\nexpensive due to the combinatorial nature of human-object interactions. In this\npaper, we propose ProciGen (Procedural interaction Generation), a method to\nprocedurally generate datasets with both, plausible interaction and diverse\nobject variation. We generate 1M+ human-object interaction pairs in 3D and\nleverage this large-scale data to train our HDM (Hierarchical Diffusion Model),\na novel method to reconstruct interacting human and unseen objects, without any\ntemplates. 
Our HDM is an image-conditioned diffusion model that learns both\nrealistic interaction and highly accurate human and object shapes. Experiments\nshow that our HDM trained with ProciGen significantly outperforms prior methods\nthat requires template meshes and that our dataset allows training methods with\nstrong generalization ability to unseen object instances. Our code and data\nwill be publicly released at:\nhttps://virtualhumans.mpi-inf.mpg.de/procigen-hdm.\n","authors":["Xianghui Xie","Bharat Lal Bhatnagar","Jan Eric Lenssen","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2312.07063v2.pdf","comment":"23 pages, 18 figures. Project page:\n https://virtualhumans.mpi-inf.mpg.de/procigen-hdm (updated the\n acknowledgement)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2204.11209v3","updated":"2024-01-22T14:13:11Z","published":"2022-04-24T07:18:04Z","title":"Hierarchical Locality Sensitive Hashing for Structured Data: A Survey","summary":" Data similarity (or distance) computation is a fundamental research topic\nwhich fosters a variety of similarity-based machine learning and data mining\napplications. In big data analytics, it is impractical to compute the exact\nsimilarity of data instances due to high computational cost. To this end, the\nLocality Sensitive Hashing (LSH) technique has been proposed to provide\naccurate estimators for various similarity measures between sets or vectors in\nan efficient manner without the learning process. Structured data (e.g.,\nsequences, trees and graphs), which are composed of elements and relations\nbetween the elements, are commonly seen in the real world, but the traditional\nLSH algorithms cannot preserve the structure information represented as\nrelations between elements. In order to conquer the issue, researchers have\nbeen devoted to the family of the hierarchical LSH algorithms. In this paper,\nwe explore the present progress of the research into hierarchical LSH from the\nfollowing perspectives: 1) Data structures, where we review various\nhierarchical LSH algorithms for three typical data structures and uncover their\ninherent connections; 2) Applications, where we review the hierarchical LSH\nalgorithms in multiple application scenarios; 3) Challenges, where we discuss\nsome potential challenges as future directions.\n","authors":["Wei Wu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2204.11209v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16034v2","updated":"2024-01-22T11:26:35Z","published":"2023-09-27T21:26:01Z","title":"Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale\n Localization","summary":" Advancements in nanotechnology and material science are paving the way toward\nnanoscale devices that combine sensing, computing, data and energy storage, and\nwireless communication. In precision medicine, these nanodevices show promise\nfor disease diagnostics, treatment, and monitoring from within the patients'\nbloodstreams. Assigning the location of a sensed biological event with the\nevent itself, which is the main proposition of flow-guided in-body nanoscale\nlocalization, would be immensely beneficial from the perspective of precision\nmedicine. The nanoscale nature of the nanodevices and the challenging\nenvironment that the bloodstream represents, result in current flow-guided\nlocalization approaches being constrained in their communication and\nenergy-related capabilities. 
The communication and energy constraints of the\nnanodevices result in different features of raw data for flow-guided\nlocalization, in turn affecting its performance. An analytical modeling of the\neffects of imperfect communication and constrained energy causing intermittent\noperation of the nanodevices on the raw data produced by the nanodevices would\nbe beneficial. Hence, we propose an analytical model of raw data for\nflow-guided localization, where the raw data is modeled as a function of\ncommunication and energy-related capabilities of the nanodevice. We evaluate\nthe model by comparing its output with the one obtained through the utilization\nof a simulator for objective evaluation of flow-guided localization, featuring\ncomparably higher level of realism. Our results across a number of scenarios\nand heterogeneous performance metrics indicate high similarity between the\nmodel and simulator-generated raw datasets.\n","authors":["Guillem Pascual","Filip Lemic","Carmen Delgado","Xavier Costa-Perez"],"pdf_url":"https://arxiv.org/pdf/2309.16034v2.pdf","comment":"6 pages, 7 figures, 4 tables, 16 references"},{"id":"http://arxiv.org/abs/2401.11800v1","updated":"2024-01-22T10:01:06Z","published":"2024-01-22T10:01:06Z","title":"Revisiting Document-Level Relation Extraction with Context-Guided Link\n Prediction","summary":" Document-level relation extraction (DocRE) poses the challenge of identifying\nrelationships between entities within a document as opposed to the traditional\nRE setting where a single sentence is input. Existing approaches rely on\nlogical reasoning or contextual cues from entities. This paper reframes\ndocument-level RE as link prediction over a knowledge graph with distinct\nbenefits: 1) Our approach combines entity context with document-derived logical\nreasoning, enhancing link prediction quality. 2) Predicted links between\nentities offer interpretability, elucidating employed reasoning. We evaluate\nour approach on three benchmark datasets: DocRED, ReDocRED, and DWIE. The\nresults indicate that our proposed method outperforms the state-of-the-art\nmodels and suggests that incorporating context-based link prediction techniques\ncan enhance the performance of document-level relation extraction models.\n","authors":["Monika Jain","Raghava Mutharaju","Ramakanth Kavuluru","Kuldeep Singh"],"pdf_url":"https://arxiv.org/pdf/2401.11800v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2305.19604v3","updated":"2024-01-22T08:13:50Z","published":"2023-05-31T07:22:15Z","title":"Medication Recommendation via Domain Knowledge Informed Deep Learning","summary":" Medication recommendation is a fundamental yet crucial branch of healthcare,\nwhich provides opportunities to support clinical physicians with more accurate\nmedication prescriptions for patients with complex health conditions. Learning\nfrom electronic health records (EHR) to recommend medications is the most\ncommon way in previous studies. However, most of them neglect incorporating\ndomain knowledge according to the clinical manifestations in the EHR of the\npatient. To address these issues, we propose a novel \\textbf{D}omain\n\\textbf{K}nowledge \\textbf{I}nformed \\textbf{Net}work (DKINet) to integrate\ndomain knowledge with observable clinical manifestations of the patient, which\nis the first dynamic domain knowledge informed framework toward medication\nrecommendation. 
In particular, we first design a knowledge-driven encoder to\ncapture the domain information and then develop a data-driven encoder to\nintegrate domain knowledge into the observable EHR. To endow the model with the\ncapability of temporal decision, we design an explicit medication encoder for\nlearning the longitudinal dependence of the patient. Extensive experiments on\nthree publicly available datasets verify the superiority of our method. The\ncode will be public upon acceptance.\n","authors":["Sicen Liu","Xiaolong Wang","Xianbing Zhao","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19604v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11742v1","updated":"2024-01-22T08:00:49Z","published":"2024-01-22T08:00:49Z","title":"Knowledge Navigation: Inferring the Interlocking Map of Knowledge from\n Research Trajectories","summary":" \"If I have seen further, it is by standing on the shoulders of giants,\" Isaac\nNewton's renowned statement hints that new knowledge builds upon existing\nfoundations, which means there exists an interdependent relationship between\nknowledge, which, yet uncovered, is implied in the historical development of\nscientific systems for hundreds of years. By leveraging natural language\nprocessing techniques, this study introduces an innovative embedding scheme\ndesigned to infer the \"knowledge interlocking map.\" This map, derived from the\nresearch trajectories of millions of scholars, reveals the intricate\nconnections among knowledge. We validate that the inferred map effectively\ndelineates disciplinary boundaries and captures the intricate relationships\nbetween diverse concepts. The utility of the interlocking map is showcased\nthrough multiple applications. Firstly, we demonstrated the multi-step analogy\ninferences within the knowledge space and the functional connectivity between\nconcepts in different disciplines. Secondly, we trace the evolution of\nknowledge across domains, observing trends such as shifts from \"Theoretical\" to\n\"Applied\" or \"Chemistry\" to \"Biomedical\" along predefined functional\ndirections. Lastly, by analyzing the high-dimensional knowledge network\nstructure, we found that knowledge connects each other with shorter global\npathways, and the interdisciplinary knowledge plays a critical role in\naccessibility of the global knowledge network. Our framework offers a novel\napproach to mining knowledge inheritance pathways in extensive scientific\nliterature, which is of great significance for understanding scientific\ndevelopment patterns, tailoring scientific learning trajectories, and\naccelerating scientific progress.\n","authors":["Shibing Xiang","Bing Liu","Yurui Huang","Chaolin Tian","Xin Jiang","Yifang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.11742v1.pdf","comment":"28 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2304.01225v2","updated":"2024-01-22T06:31:50Z","published":"2023-04-02T07:25:01Z","title":"A greedy approach for increased vehicle utilization in ridesharing\n networks","summary":" In recent years, ridesharing platforms have become a prominent mode of\ntransportation for the residents of urban areas. As a fundamental problem,\nroute recommendation for these platforms is vital for their sustenance. The\nworks done in this direction have recommended routes with higher passenger\ndemand. Despite the existing works, statistics have suggested that these\nservices cause increased greenhouse emissions compared to private vehicles as\nthey roam around in search of riders. 
This analysis provides finer details\nregarding the functionality of ridesharing systems and it reveals that in the\nface of their boom, they have not utilized the vehicle capacity efficiently. We\npropose to overcome the above limitations and recommend routes that will fetch\nmultiple passengers simultaneously which will result in increased vehicle\nutilization and thereby decrease the effect of these systems on the\nenvironment. As route recommendation is NP-hard, we propose a k-hop-based\nsliding window approximation algorithm that reduces the search space from\nentire road network to a window. We further demonstrate that maximizing\nexpected demand is submodular and greedy algorithms can be used to optimize our\nobjective function within a window. We evaluate our proposed model on\nreal-world datasets and experimental results demonstrate superior performance\nby our proposed model.\n","authors":["Aqsa Ashraf Makhdomi","Iqra Altaf Gillani"],"pdf_url":"https://arxiv.org/pdf/2304.01225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11705v1","updated":"2024-01-22T06:12:48Z","published":"2024-01-22T06:12:48Z","title":"Domain-Aware Cross-Attention for Cross-domain Recommendation","summary":" Cross-domain recommendation (CDR) is an important method to improve\nrecommender system performance, especially when observations in target domains\nare sparse. However, most existing cross-domain recommendations fail to fully\nutilize the target domain's special features and are hard to be generalized to\nnew domains. The designed network is complex and is not suitable for rapid\nindustrial deployment. Our method introduces a two-step domain-aware\ncross-attention, extracting transferable features of the source domain from\ndifferent granularity, which allows the efficient expression of both domain and\nuser interests. In addition, we simplify the training process, and our model\ncan be easily deployed on new domains. We conduct experiments on both public\ndatasets and industrial datasets, and the experimental results demonstrate the\neffectiveness of our method. We have also deployed the model in an online\nadvertising system and observed significant improvements in both\nClick-Through-Rate (CTR) and effective cost per mille (ECPM).\n","authors":["Yuhao Luo","Shiwei Ma","Mingjun Nie","Changping Peng","Zhangang Lin","Jingping Shao","Qianfang Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11705v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.11648v1","updated":"2024-01-22T01:58:32Z","published":"2024-01-22T01:58:32Z","title":"Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal\n Contrastive EHR Modelling with Hierarchical Regularisation","summary":" Predicting next visit diagnosis using Electronic Health Records (EHR) is an\nessential task in healthcare, critical for devising proactive future plans for\nboth healthcare providers and patients. Nonetheless, many preceding studies\nhave not sufficiently addressed the heterogeneous and hierarchical\ncharacteristics inherent in EHR data, inevitably leading to sub-optimal\nperformance. To this end, we propose NECHO, a novel medical code-centric\nmultimodal contrastive EHR learning framework with hierarchical regularisation.\nFirst, we integrate multifaceted information encompassing medical codes,\ndemographics, and clinical notes using a tailored network design and a pair of\nbimodal contrastive losses, all of which pivot around a medical code\nrepresentation. 
We also regularise modality-specific encoders using a parental\nlevel information in medical ontology to learn hierarchical structure of EHR\ndata. A series of experiments on MIMIC-III data demonstrates effectiveness of\nour approach.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2401.11648v1.pdf","comment":"Accepted to EACL 2024 (The 18th Conference of the European Chapter of\n the Association for Computational Linguistics)"},{"id":"http://arxiv.org/abs/2306.16001v2","updated":"2024-01-22T00:27:45Z","published":"2023-06-28T08:20:35Z","title":"Streamlining Social Media Information Extraction for Public Health\n Research with Deep Learning","summary":" Objective: Social media-based public health research is crucial for epidemic\nsurveillance, but most studies identify relevant corpora with keyword matching.\nThis study develops a system to streamline the process of curating colloquial\nmedical dictionaries. We demonstrate the pipeline by curating a UMLS-colloquial\nsymptom dictionary from COVID-19-related tweets as proof of concept. Methods:\nCOVID-19-related tweets from February 1, 2020, to April 30, 2022 were used. The\npipeline includes three modules: a named entity recognition module to detect\nsymptoms in tweets; an entity normalization module to aggregate detected\nentities; and a mapping module that iteratively maps entities to Unified\nMedical Language System concepts. A random 500 entity sample were drawn from\nthe final dictionary for accuracy validation. Additionally, we conducted a\nsymptom frequency distribution analysis to compare our dictionary to a\npre-defined lexicon from previous research. Results: We identified 498,480\nunique symptom entity expressions from the tweets. Pre-processing reduces the\nnumber to 18,226. The final dictionary contains 38,175 unique expressions of\nsymptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). Symptom\ndistribution analysis found that our dictionary detects more symptoms and is\neffective at identifying psychiatric disorders like anxiety and depression,\noften missed by pre-defined lexicons. Conclusion: This study advances public\nhealth research by implementing a novel, systematic pipeline for curating\nsymptom lexicons from social media data. The final lexicon's high accuracy,\nvalidated by medical professionals, underscores the potential of this\nmethodology to reliably interpret and categorize vast amounts of unstructured\nsocial media data into actionable medical insights across diverse linguistic\nand regional landscapes.\n","authors":["Yining Hua","Shixu Lin","Minghui Li","Yujie Zhang","Dinah Foer","Siwen Wang","Peilin Zhou","Li Zhou","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2306.16001v2.pdf","comment":"Updated full paper. Abstract presented at IEEE ICHI 2023 and AMIA\n Annual Symposium 2023"},{"id":"http://arxiv.org/abs/2312.11486v2","updated":"2024-01-22T19:57:27Z","published":"2023-11-30T11:49:33Z","title":"Preference and Concurrence Aware Bayesian Graph Neural Networks for\n Recommender Systems","summary":" Graph-based collaborative filtering methods have prevailing performance for\nrecommender systems since they can capture high-order information between users\nand items, in which the graphs are constructed from the observed user-item\ninteractions that might miss links or contain spurious positive interactions in\nindustrial scenarios. The Bayesian Graph Neural Network framework approaches\nthis issue with generative models for the interaction graphs. 
The critical\nproblem is to devise a proper family of graph generative models tailored to\nrecommender systems. We propose an efficient generative model that jointly\nconsiders the preferences of users, the concurrence of items and some important\ngraph structure information. Experiments on four popular benchmark datasets\ndemonstrate the effectiveness of our proposed graph generative methods for\nrecommender systems.\n","authors":["Hongjian Gu","Yaochen Hu","Yingxue Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.11486v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.12217v1","updated":"2024-01-22T18:59:29Z","published":"2024-01-22T18:59:29Z","title":"Exploring Simple Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation models aim to accurately assign a\nsemantic label to each pixel in an image from a set of arbitrary\nopen-vocabulary texts. In order to learn such pixel-level alignment, current\napproaches typically rely on a combination of (i) image-level VL model (e.g.\nCLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this\npaper, we introduce S-Seg, a novel model that can achieve surprisingly strong\nperformance without depending on any of the above elements. S-Seg leverages\npseudo-mask and language to train a MaskFormer, and can be easily trained from\npublicly available image-text datasets. Contrary to prior works, our model\ndirectly trains for pixel-level features and language alignment. Once trained,\nS-Seg generalizes well to multiple testing datasets without requiring\nfine-tuning. In addition, S-Seg has the extra benefits of scalability with data\nand consistently improvement when augmented with self-training. We believe that\nour simple yet effective approach will serve as a solid baseline for future\nresearch.\n","authors":["Zihang Lai"],"pdf_url":"https://arxiv.org/pdf/2401.12217v1.pdf","comment":"Code is available at: https://github.com/zlai0/S-Seg"},{"id":"http://arxiv.org/abs/2401.12216v1","updated":"2024-01-22T18:59:12Z","published":"2024-01-22T18:59:12Z","title":"Mitigating Covariate Shift in Misspecified Regression with Applications\n to Reinforcement Learning","summary":" A pervasive phenomenon in machine learning applications is distribution\nshift, where training and deployment conditions for a machine learning model\ndiffer. As distribution shift typically results in a degradation in\nperformance, much attention has been devoted to algorithmic interventions that\nmitigate these detrimental effects. In this paper, we study the effect of\ndistribution shift in the presence of model misspecification, specifically\nfocusing on $L_{\\infty}$-misspecified regression and adversarial covariate\nshift, where the regression target remains fixed while the covariate\ndistribution changes arbitrarily. We show that empirical risk minimization, or\nstandard least squares regression, can result in undesirable misspecification\namplification where the error due to misspecification is amplified by the\ndensity ratio between the training and testing distributions. 
As our main\nresult, we develop a new algorithm -- inspired by robust optimization\ntechniques -- that avoids this undesirable behavior, resulting in no\nmisspecification amplification while still obtaining optimal statistical rates.\nAs applications, we use this regression procedure to obtain new guarantees in\noffline and online reinforcement learning with misspecification and establish\nnew separations between previously studied structural conditions and notions of\ncoverage.\n","authors":["Philip Amortila","Tongyi Cao","Akshay Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2401.12216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13507v2","updated":"2024-01-22T18:54:52Z","published":"2023-08-25T17:33:05Z","title":"Large Language Models Should Ask Clarifying Questions to Increase\n Confidence in Generated Code","summary":" Large language models (LLMs) have significantly improved the ability to\nperform tasks in the field of code generation. However, there is still a gap\nbetween LLMs being capable coders and being top-tier software engineers. Based\non the observation that toplevel software engineers often ask clarifying\nquestions to reduce ambiguity in both requirements and coding solutions, I\nargue that the same should be applied to LLMs for code generation tasks. By\nasking probing questions in various topics before generating the final code,\nthe challenges of programming with LLMs, such as unclear intent specification,\nlack of computational thinking, and undesired code quality, may be alleviated.\nThis, in turn, increases confidence in the generated code. In this work, I\nexplore how to leverage better communication skills to achieve greater\nconfidence in generated code. I propose a communication-centered process that\nuses an LLM-generated communicator to identify issues with high ambiguity or\nlow confidence in problem descriptions and generated code. I then ask\nclarifying questions to obtain responses from users for refining the code.\n","authors":["Jie JW Wu"],"pdf_url":"https://arxiv.org/pdf/2308.13507v2.pdf","comment":"6 pages, 2 figures, 1 table. Accepted and presented at the 7th Annual\n Symposium on Machine Programming (MAPS 2023 Workshop, see\n https://mapsworkshop.github.io/). Reference: \"Wu, Jie JW. Large Language\n Models Should Ask Clarifying Questions to Increase Confidence in Generated\n Code. The 7th Annual Symposium on Machine Programming (MAPS 23), December 3,\n 2023, San Francisco, CA, USA\""},{"id":"http://arxiv.org/abs/2401.03506v3","updated":"2024-01-22T18:53:36Z","published":"2024-01-07T14:54:57Z","title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language\n Models","summary":" In this paper, we introduce DiarizationLM, a framework to leverage large\nlanguage models (LLM) to post-process the outputs from a speaker diarization\nsystem. Various goals can be achieved with the proposed framework, such as\nimproving the readability of the diarized transcript, or reducing the word\ndiarization error rate (WDER). In this framework, the outputs of the automatic\nspeech recognition (ASR) and speaker diarization systems are represented as a\ncompact textual format, which is included in the prompt to an optionally\nfinetuned LLM. The outputs of the LLM can be used as the refined diarization\nresults with the desired enhancement. As a post-processing step, this framework\ncan be easily applied to any off-the-shelf ASR and speaker diarization systems\nwithout retraining existing components. 
Our experiments show that a finetuned\nPaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone\nconversation dataset, and rel. 44.9% on the Callhome English dataset.\n","authors":["Quan Wang","Yiling Huang","Guanlong Zhao","Evan Clark","Wei Xia","Hank Liao"],"pdf_url":"https://arxiv.org/pdf/2401.03506v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12207v1","updated":"2024-01-22T18:49:56Z","published":"2024-01-22T18:49:56Z","title":"Rate-Distortion-Perception Tradeoff Based on the\n Conditional-Distribution Perception Measure","summary":" We study the rate-distortion-perception (RDP) tradeoff for a memoryless\nsource model in the asymptotic limit of large block-lengths. Our perception\nmeasure is based on a divergence between the distributions of the source and\nreconstruction sequences conditioned on the encoder output, which was first\nproposed in [1], [2]. We consider the case when there is no shared randomness\nbetween the encoder and the decoder. For the case of discrete memoryless\nsources we derive a single-letter characterization of the RDP function, thus\nsettling a problem that remains open for the marginal metric introduced in Blau\nand Michaeli [3] (with no shared randomness). Our achievability scheme is based\non lossy source coding with a posterior reference map proposed in [4]. For the\ncase of continuous valued sources under squared error distortion measure and\nsquared quadratic Wasserstein perception measure we also derive a single-letter\ncharacterization and show that a noise-adding mechanism at the decoder suffices\nto achieve the optimal representation. For the case of zero perception loss, we\nshow that our characterization interestingly coincides with the results for the\nmarginal metric derived in [5], [6] and again demonstrate that zero perception\nloss can be achieved with a $3$-dB penalty in the minimum distortion. Finally\nwe specialize our results to the case of Gaussian sources. We derive the RDP\nfunction for vector Gaussian sources and propose a waterfilling type solution.\nWe also partially characterize the RDP function for a mixture of vector\nGaussians.\n","authors":["Sadaf Salehkalaibar","Jun Chen","Ashish Khisti","Wei Yu"],"pdf_url":"https://arxiv.org/pdf/2401.12207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12205v1","updated":"2024-01-22T18:46:30Z","published":"2024-01-22T18:46:30Z","title":"Retrieval-Guided Reinforcement Learning for Boolean Circuit Minimization","summary":" Logic synthesis, a pivotal stage in chip design, entails optimizing chip\nspecifications encoded in hardware description languages like Verilog into\nhighly efficient implementations using Boolean logic gates. The process\ninvolves a sequential application of logic minimization heuristics (``synthesis\nrecipe\"), with their arrangement significantly impacting crucial metrics such\nas area and delay. Addressing the challenge posed by the broad spectrum of\ndesign complexities - from variations of past designs (e.g., adders and\nmultipliers) to entirely novel configurations (e.g., innovative processor\ninstructions) - requires a nuanced `synthesis recipe` guided by human expertise\nand intuition. This study conducts a thorough examination of learning and\nsearch techniques for logic synthesis, unearthing a surprising revelation:\npre-trained agents, when confronted with entirely novel designs, may veer off\ncourse, detrimentally affecting the search trajectory. 
We present ABC-RL, a\nmeticulously tuned $\\alpha$ parameter that adeptly adjusts recommendations from\npre-trained agents during the search process. Computed based on similarity\nscores through nearest neighbor retrieval from the training dataset, ABC-RL\nyields superior synthesis recipes tailored for a wide array of hardware\ndesigns. Our findings showcase substantial enhancements in the\nQuality-of-result (QoR) of synthesized circuits, boasting improvements of up to\n24.8% compared to state-of-the-art techniques. Furthermore, ABC-RL achieves an\nimpressive up to 9x reduction in runtime (iso-QoR) when compared to current\nstate-of-the-art methodologies.\n","authors":["Animesh Basak Chowdhury","Marco Romanelli","Benjamin Tan","Ramesh Karri","Siddharth Garg"],"pdf_url":"https://arxiv.org/pdf/2401.12205v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12202v1","updated":"2024-01-22T18:42:20Z","published":"2024-01-22T18:42:20Z","title":"OK-Robot: What Really Matters in Integrating Open-Knowledge Models for\n Robotics","summary":" Remarkable progress has been made in recent years in the fields of vision,\nlanguage, and robotics. We now have vision models capable of recognizing\nobjects based on language queries, navigation systems that can effectively\ncontrol mobile systems, and grasping models that can handle a wide range of\nobjects. Despite these advancements, general-purpose applications of robotics\nstill lag behind, even though they rely on these fundamental capabilities of\nrecognition, navigation, and grasping. In this paper, we adopt a systems-first\napproach to develop a new Open Knowledge-based robotics framework called\nOK-Robot. By combining Vision-Language Models (VLMs) for object detection,\nnavigation primitives for movement, and grasping primitives for object\nmanipulation, OK-Robot offers a integrated solution for pick-and-drop\noperations without requiring any training. To evaluate its performance, we run\nOK-Robot in 10 real-world home environments. The results demonstrate that\nOK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks,\nrepresenting a new state-of-the-art in Open Vocabulary Mobile Manipulation\n(OVMM) with nearly 1.8x the performance of prior work. On cleaner, uncluttered\nenvironments, OK-Robot's performance increases to 82%. However, the most\nimportant insight gained from OK-Robot is the critical role of nuanced details\nwhen combining Open Knowledge systems like VLMs with robotic modules. Videos of\nour experiments are available on our website: https://ok-robot.github.io\n","authors":["Peiqi Liu","Yaswanth Orru","Chris Paxton","Nur Muhammad Mahi Shafiullah","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2401.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12200v1","updated":"2024-01-22T18:39:40Z","published":"2024-01-22T18:39:40Z","title":"APT: Adaptive Pruning and Tuning Pretrained Language Models for\n Efficient Training and Inference","summary":" Fine-tuning and inference with large Language Models (LM) are generally known\nto be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces\ntraining memory by updating a small number of LM parameters but does not\nimprove inference efficiency. Structured pruning improves LM inference\nefficiency by removing consistent parameter blocks, yet often increases\ntraining memory and time. To improve both training and inference efficiency, we\nintroduce APT that adaptively prunes and tunes parameters for the LMs. 
At the\nearly stage of fine-tuning, APT dynamically adds salient tuning parameters for\nfast and accurate convergence while discarding unimportant parameters for\nefficiency. Compared to baselines, our experiments show that APT maintains up\nto 98% task performance when pruning RoBERTa and T5 models with 40% parameters\nleft while keeping 86.4% LLaMA models' performance with 70% parameters\nremained. Furthermore, APT speeds up LMs fine-tuning by up to 8x and reduces\nlarge LMs memory training footprint by up to 70%.\n","authors":["Bowen Zhao","Hannaneh Hajishirzi","Qingqing Cao"],"pdf_url":"https://arxiv.org/pdf/2401.12200v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.12187v1","updated":"2024-01-22T18:27:08Z","published":"2024-01-22T18:27:08Z","title":"WARM: On the Benefits of Weight Averaged Reward Models","summary":" Aligning large language models (LLMs) with human preferences through\nreinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit\nfailures in the reward model (RM) to achieve seemingly high rewards without\nmeeting the underlying objectives. We identify two primary challenges when\ndesigning RMs to mitigate reward hacking: distribution shifts during the RL\nprocess and inconsistencies in human preferences. As a solution, we propose\nWeight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then\naveraging them in the weight space. This strategy follows the observation that\nfine-tuned weights remain linearly mode connected when sharing the same\npre-training. By averaging weights, WARM improves efficiency compared to the\ntraditional ensembling of predictions, while improving reliability under\ndistribution shifts and robustness to preference inconsistencies. Our\nexperiments on summarization tasks, using best-of-N and RL methods, shows that\nWARM improves the overall quality and alignment of LLM predictions; for\nexample, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy\nRL fine-tuned with a single RM.\n","authors":["Alexandre Ramé","Nino Vieillard","Léonard Hussenot","Robert Dadashi","Geoffrey Cideron","Olivier Bachem","Johan Ferret"],"pdf_url":"https://arxiv.org/pdf/2401.12187v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10305v2","updated":"2024-01-22T18:12:20Z","published":"2024-01-18T13:18:51Z","title":"Personality Trait Inference Via Mobile Phone Sensors: A Machine Learning\n Approach","summary":" This study provides evidence that personality can be reliably predicted from\nactivity data collected through mobile phone sensors. Employing a set of well\ninformed indicators calculable from accelerometer records and movement\npatterns, we were able to predict users' personality up to a 0.78 F1 score on a\ntwo class problem. Given the fast growing number of data collected from mobile\nphones, our novel personality indicators open the door to exciting avenues for\nfuture research in social sciences. Our results reveal distinct behavioral\npatterns that proved to be differentially predictive of big five personality\ntraits. They potentially enable cost effective, questionnaire free\ninvestigation of personality related questions at an unprecedented scale. We\nshow how a combination of rich behavioral data obtained with smartphone sensing\nand the use of machine learning techniques can help to advance personality\nresearch and can inform both practitioners and researchers about the different\nbehavioral patterns of personality. 
These findings have practical implications\nfor organizations harnessing mobile sensor data for personality assessment,\nguiding the refinement of more precise and efficient prediction models in the\nfuture.\n","authors":["Wun Yung Shaney Sze","Maryglen Pearl Herrero","Roger Garriga"],"pdf_url":"https://arxiv.org/pdf/2401.10305v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.12181v1","updated":"2024-01-22T18:11:01Z","published":"2024-01-22T18:11:01Z","title":"Universal Neurons in GPT2 Language Models","summary":" A basic question within the emerging field of mechanistic interpretability is\nthe degree to which neural networks learn the same underlying mechanisms. In\nother words, are neural mechanisms universal across different models? In this\nwork, we study the universality of individual neurons across GPT2 models\ntrained from different initial random seeds, motivated by the hypothesis that\nuniversal neurons are likely to be interpretable. In particular, we compute\npairwise correlations of neuron activations over 100 million tokens for every\nneuron pair across five different seeds and find that 1-5\\% of neurons are\nuniversal, that is, pairs of neurons which consistently activate on the same\ninputs. We then study these universal neurons in detail, finding that they\nusually have clear interpretations and taxonomize them into a small number of\nneuron families. We conclude by studying patterns in neuron weights to\nestablish several universal functional roles of neurons in simple circuits:\ndeactivating attention heads, changing the entropy of the next token\ndistribution, and predicting the next token to (not) be within a particular\nset.\n","authors":["Wes Gurnee","Theo Horsley","Zifan Carl Guo","Tara Rezaei Kheirkhah","Qinyi Sun","Will Hathaway","Neel Nanda","Dimitris Bertsimas"],"pdf_url":"https://arxiv.org/pdf/2401.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12179v1","updated":"2024-01-22T18:10:10Z","published":"2024-01-22T18:10:10Z","title":"DITTO: Diffusion Inference-Time T-Optimization for Music Generation","summary":" We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose\nframe-work for controlling pre-trained text-to-music diffusion models at\ninference-time via optimizing initial noise latents. Our method can be used to\noptimize through any differentiable feature matching loss to achieve a target\n(stylized) output and leverages gradient checkpointing for memory efficiency.\nWe demonstrate a surprisingly wide-range of applications for music generation\nincluding inpainting, outpainting, and looping as well as intensity, melody,\nand musical structure control - all without ever fine-tuning the underlying\nmodel. When we compare our approach against related training, guidance, and\noptimization-based methods, we find DITTO achieves state-of-the-art performance\non nearly all tasks, including outperforming comparable approaches on\ncontrollability, audio quality, and computational efficiency, thus opening the\ndoor for high-quality, flexible, training-free control of diffusion models.\nSound examples can be found at https://DITTO-Music.github.io/web/.\n","authors":["Zachary Novack","Julian McAuley","Taylor Berg-Kirkpatrick","Nicholas J. 
Bryan"],"pdf_url":"https://arxiv.org/pdf/2401.12179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11359v2","updated":"2024-01-22T18:01:37Z","published":"2022-05-23T14:45:34Z","title":"Towards Size-Independent Generalization Bounds for Deep Operator Nets","summary":" In recent times machine learning methods have made significant advances in\nbecoming a useful tool for analyzing physical systems. A particularly active\narea in this theme has been \"physics-informed machine learning\" which focuses\non using neural nets for numerically solving differential equations. In this\nwork, we aim to advance the theory of measuring out-of-sample error while\ntraining DeepONets -- which is among the most versatile ways to solve PDE\nsystems in one-shot.\n Firstly, for a class of DeepONets, we prove a bound on their Rademacher\ncomplexity which does not explicitly scale with the width of the nets involved.\nSecondly, we use this to show how the Huber loss can be chosen so that for\nthese DeepONet classes generalization error bounds can be obtained that have no\nexplicit dependence on the size of the nets. We note that our theoretical\nresults apply to any PDE being targeted to be solved by DeepONets.\n","authors":["Pulkit Gopalani","Sayar Karmakar","Dibyakanti Kumar","Anirbit Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2205.11359v2.pdf","comment":"27 pages, 5 figures; Added theorem on generalization error indicating\n benefits of training DeepONets on the Huber loss and corresponding\n experiments"},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. 
Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08573v2","updated":"2024-01-22T17:54:58Z","published":"2024-01-16T18:58:36Z","title":"Benchmarking the Robustness of Image Watermarks","summary":" This paper investigates the weaknesses of image watermarking techniques. We\npresent WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel\nbenchmark for assessing watermark robustness, overcoming the limitations of\ncurrent evaluation methods.WAVES integrates detection and identification tasks,\nand establishes a standardized evaluation protocol comprised of a diverse range\nof stress tests. The attacks in WAVES range from traditional image distortions\nto advanced and novel variations of diffusive, and adversarial attacks. Our\nevaluation examines two pivotal dimensions: the degree of image quality\ndegradation and the efficacy of watermark detection after attacks. We develop a\nseries of Performance vs. Quality 2D plots, varying over several prominent\nimage similarity metrics, which are then aggregated in a heuristically novel\nmanner to paint an overall picture of watermark robustness and attack potency.\nOur comprehensive evaluation reveals previously undetected vulnerabilities of\nseveral modern watermarking algorithms. We envision WAVES as a toolkit for the\nfuture development of robust watermarking systems. The project is available at\nhttps://wavesbench.github.io/\n","authors":["Bang An","Mucong Ding","Tahseen Rabbani","Aakriti Agrawal","Yuancheng Xu","Chenghao Deng","Sicheng Zhu","Abdirisak Mohamed","Yuxin Wen","Tom Goldstein","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12149v1","updated":"2024-01-22T17:36:23Z","published":"2024-01-22T17:36:23Z","title":"Personalized Over-the-Air Federated Learning with Personalized\n Reconfigurable Intelligent Surfaces","summary":" Over-the-air federated learning (OTA-FL) provides bandwidth-efficient\nlearning by leveraging the inherent superposition property of wireless\nchannels. Personalized federated learning balances performance for users with\ndiverse datasets, addressing real-life data heterogeneity. We propose the first\npersonalized OTA-FL scheme through multi-task learning, assisted by personal\nreconfigurable intelligent surfaces (RIS) for each user. We take a cross-layer\napproach that optimizes communication and computation resources for global and\npersonalized tasks in time-varying channels with imperfect channel state\ninformation, using multi-task learning for non-i.i.d data. Our PROAR-PFed\nalgorithm adaptively designs power, local iterations, and RIS configurations.\nWe present convergence analysis for non-convex objectives and demonstrate that\nPROAR-PFed outperforms state-of-the-art on the Fashion-MNIST dataset.\n","authors":["Jiayu Mao","Aylin Yener"],"pdf_url":"https://arxiv.org/pdf/2401.12149v1.pdf","comment":"Copyright 2024 IEEE. Published in ICASSP 2024, 14-19 April, Seoul,\n Korea. Personal use of this material is permitted. 
However, permission to\n reprint/republish this material for advertising or promotional purposes or\n for creating new collective works for resale or redistribution to servers or\n lists, or to reuse any copyrighted component of this work in other works,\n must be obtained from the IEEE"},{"id":"http://arxiv.org/abs/2401.12133v1","updated":"2024-01-22T17:15:02Z","published":"2024-01-22T17:15:02Z","title":"VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear\n Responses in VR Stand-up Interactive Games","summary":" Understanding and recognizing emotions are important and challenging issues\nin the metaverse era. Understanding, identifying, and predicting fear, which is\none of the fundamental human emotions, in virtual reality (VR) environments\nplays an essential role in immersive game development, scene development, and\nnext-generation virtual human-computer interaction applications. In this\narticle, we used VR horror games as a medium to analyze fear emotions by\ncollecting multi-modal data (posture, audio, and physiological signals) from 23\nplayers. We used an LSTM-based model to predict fear with accuracies of 65.31%\nand 90.47% under 6-level classification (no fear and five different levels of\nfear) and 2-level classification (no fear and fear), respectively. We\nconstructed a multi-modal natural behavior dataset of immersive human fear\nresponses (VRMN-bD) and compared it with existing relevant advanced datasets.\nThe results show that our dataset has fewer limitations in terms of collection\nmethod, data scale and audience scope. We are unique and advanced in targeting\nmulti-modal datasets of fear and behavior in VR stand-up interactive\nenvironments. Moreover, we discussed the implications of this work for\ncommunities and applications. The dataset and pre-trained model are available\nat https://github.com/KindOPSTAR/VRMN-bD.\n","authors":["He Zhang","Xinyang Li","Yuanxi Sun","Xinyi Fu","Christine Qiu","John M. Carroll"],"pdf_url":"https://arxiv.org/pdf/2401.12133v1.pdf","comment":"Accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.12132v1","updated":"2024-01-22T17:14:47Z","published":"2024-01-22T17:14:47Z","title":"Evaluation of QCNN-LSTM for Disability Forecasting in Multiple Sclerosis\n Using Sequential Multisequence MRI","summary":" Introduction Quantum Convolutional Neural Network (QCNN)-Long Short-Term\nMemory (LSTM) models were studied to provide sequential relationships for each\ntimepoint in MRIs of patients with Multiple Sclerosis (MS). In this pilot\nstudy, we compared three QCNN-LSTM models for binary classification of MS\ndisability benchmarked against classical neural network architectures. Our\nhypothesis is that quantum models will provide competitive performance. Methods\nMatrix Product State (MPS), reverse Multistate Entanglement Renormalization\nAnsatz (MERA), and Tree-Tensor Network (TTN) circuits were paired with LSTM\nlayer to process near-annual MRI data of patients diagnosed with MS. These were\nbenchmarked against a Visual Geometry Group (VGG)-LSTM and a Video Vision\nTransformer (ViViT). Predicted logits were measured against ground truth labels\nof each patient's Extended Disability Severity Score (EDSS) using binary\ncross-entropy loss. Training/validation/holdout testing was partitioned using\n5-fold cross validation with a total split of 60:20:20. Levene's test of\nvariance was used to measure statistical difference and Student's t-test for\npaired model differences in mean. 
Results The MPS-LSTM, reverse MERA-LSTM, and\nTTN-LSTM had holdout testing ROC-AUC of 0.70, 0.77, and 0.81, respectively\n(p-value 0.915). VGG16-LSTM and ViViT performed similarly with ROC-AUC of 0.73\nand 0.77, respectively (p-value 0.631). Overall variance and mean were not\nstatistically significant (p-value 0.713), however, time to train was\nsignificantly faster for the QCNN-LSTMs (39.4 sec per fold vs. 224 and 218,\nrespectively, p-value <0.001). Conclusion QCNN-LSTM models perform\ncompetitively to their classical counterparts with greater efficiency in train\ntime. Clinically, these can add value in terms of efficiency to time-dependent\ndeep learning prediction of disease progression based upon medical imaging.\n","authors":["John D. Mayfield","Issam El Naqa"],"pdf_url":"https://arxiv.org/pdf/2401.12132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12131v1","updated":"2024-01-22T17:13:50Z","published":"2024-01-22T17:13:50Z","title":"NeuroSynt: A Neuro-symbolic Portfolio Solver for Reactive Synthesis","summary":" We introduce NeuroSynt, a neuro-symbolic portfolio solver framework for\nreactive synthesis. At the core of the solver lies a seamless integration of\nneural and symbolic approaches to solving the reactive synthesis problem. To\nensure soundness, the neural engine is coupled with model checkers verifying\nthe predictions of the underlying neural models. The open-source implementation\nof NeuroSynt provides an integration framework for reactive synthesis in which\nnew neural and state-of-the-art symbolic approaches can be seamlessly\nintegrated. Extensive experiments demonstrate its efficacy in handling\nchallenging specifications, enhancing the state-of-the-art reactive synthesis\nsolvers, with NeuroSynt contributing novel solves in the current SYNTCOMP\nbenchmarks.\n","authors":["Matthias Cosler","Christopher Hahn","Ayham Omar","Frederik Schmitt"],"pdf_url":"https://arxiv.org/pdf/2401.12131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06144v2","updated":"2024-01-22T17:11:57Z","published":"2023-11-30T23:31:33Z","title":"DFU: scale-robust diffusion model for zero-shot super-resolution image\n generation","summary":" Diffusion generative models have achieved remarkable success in generating\nimages with a fixed resolution. However, existing models have limited ability\nto generalize to different resolutions when training data at those resolutions\nare not available. Leveraging techniques from operator learning, we present a\nnovel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the\nscore operator by combining both spatial and spectral information at multiple\nresolutions. Comparisons of DFU to baselines demonstrate its scalability: 1)\nsimultaneously training on multiple resolutions improves FID over training at\nany single fixed resolution; 2) DFU generalizes beyond its training\nresolutions, allowing for coherent, high-fidelity generation at\nhigher-resolutions with the same model, i.e. 
zero-shot super-resolution\nimage-generation; 3) we propose a fine-tuning strategy to further enhance the\nzero-shot super-resolution image-generation capability of our model, leading to\na FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which no\nother method can come close to achieving.\n","authors":["Alex Havrilla","Kevin Rojas","Wenjing Liao","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2401.06144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12129v1","updated":"2024-01-22T17:11:01Z","published":"2024-01-22T17:11:01Z","title":"Out-of-Distribution Detection & Applications With Ablated Learned\n Temperature Energy","summary":" As deep neural networks become adopted in high-stakes domains, it is crucial\nto be able to identify when inference inputs are Out-of-Distribution (OOD) so\nthat users can be alerted of likely drops in performance and calibration\ndespite high confidence. Among many others, existing methods use the following\ntwo scores to do so without training on any apriori OOD examples: a learned\ntemperature and an energy score. In this paper we introduce Ablated Learned\nTemperature Energy (or \"AbeT\" for short), a method which combines these prior\nmethods in novel ways with effective modifications. Due to these contributions,\nAbeT lowers the False Positive Rate at $95\\%$ True Positive Rate (FPR@95) by\n$35.39\\%$ in classification (averaged across all ID and OOD datasets measured)\ncompared to state of the art without training networks in multiple stages or\nrequiring hyperparameters or test-time backward passes. We additionally provide\nempirical insights as to how our model learns to distinguish between\nIn-Distribution (ID) and OOD samples while only being explicitly trained on ID\nsamples via exposure to misclassified ID examples at training time. Lastly, we\nshow the efficacy of our method in identifying predicted bounding boxes and\npixels corresponding to OOD objects in object detection and semantic\nsegmentation, respectively - with an AUROC increase of $5.15\\%$ in object\ndetection and both a decrease in FPR@95 of $41.48\\%$ and an increase in AUPRC\nof $34.20\\%$ on average in semantic segmentation compared to previous state of\nthe art.\n","authors":["Will LeVine","Benjamin Pikus","Jacob Phillips","Berk Norman","Fernando Amat Gil","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2401.12129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15462v2","updated":"2024-01-22T17:02:16Z","published":"2023-09-27T07:57:37Z","title":"DTC: Deep Tracking Control","summary":" Legged locomotion is a complex control problem that requires both accuracy\nand robustness to cope with real-world challenges. Legged systems have\ntraditionally been controlled using trajectory optimization with inverse\ndynamics. Such hierarchical model-based methods are appealing due to intuitive\ncost function tuning, accurate planning, generalization, and most importantly,\nthe insightful understanding gained from more than one decade of extensive\nresearch. However, model mismatch and violation of assumptions are common\nsources of faulty operation. Simulation-based reinforcement learning, on the\nother hand, results in locomotion policies with unprecedented robustness and\nrecovery skills. Yet, all learning algorithms struggle with sparse rewards\nemerging from environments where valid footholds are rare, such as gaps or\nstepping stones. 
In this work, we propose a hybrid control architecture that\ncombines the advantages of both worlds to simultaneously achieve greater\nrobustness, foot-placement accuracy, and terrain generalization. Our approach\nutilizes a model-based planner to roll out a reference motion during training.\nA deep neural network policy is trained in simulation, aiming to track the\noptimized footholds. We evaluate the accuracy of our locomotion pipeline on\nsparse terrains, where pure data-driven methods are prone to fail. Furthermore,\nwe demonstrate superior robustness in the presence of slippery or deformable\nground when compared to model-based counterparts. Finally, we show that our\nproposed tracking controller generalizes across different trajectory\noptimization methods not seen during training. In conclusion, our work unites\nthe predictive capabilities and optimality guarantees of online planning with\nthe inherent robustness attributed to offline learning.\n","authors":["Fabian Jenelten","Junzhe He","Farbod Farshidian","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2309.15462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12113v1","updated":"2024-01-22T16:51:01Z","published":"2024-01-22T16:51:01Z","title":"Extracting Formulae in Many-Valued Logic from Deep Neural Networks","summary":" We propose a new perspective on deep ReLU networks, namely as circuit\ncounterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV)\ngeneralization of Boolean logic. An algorithm for extracting formulae in MV\nlogic from deep ReLU networks is presented. As the algorithm applies to\nnetworks with general, in particular also real-valued, weights, it can be used\nto extract logical formulae from deep ReLU networks trained on data.\n","authors":["Yani Zhang","Helmut Bölcskei"],"pdf_url":"https://arxiv.org/pdf/2401.12113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12108v1","updated":"2024-01-22T16:45:15Z","published":"2024-01-22T16:45:15Z","title":"On-Time Delivery in Crowdshipping Systems: An Agent-Based Approach Using\n Streaming Data","summary":" In parcel delivery, the \"last mile\" from the parcel hub to the customer is\ncostly, especially for time-sensitive delivery tasks that have to be completed\nwithin hours after arrival. Recently, crowdshipping has attracted increased\nattention as a new alternative to traditional delivery modes. In crowdshipping,\nprivate citizens (\"the crowd\") perform short detours in their daily lives to\ncontribute to parcel delivery in exchange for small incentives. However,\nachieving desirable crowd behavior is challenging as the crowd is highly\ndynamic and consists of autonomous, self-interested individuals. Leveraging\ncrowdshipping for time-sensitive deliveries remains an open challenge. In this\npaper, we present an agent-based approach to on-time parcel delivery with\ncrowds. Our system performs data stream processing on the couriers' smartphone\nsensor data to predict delivery delays. Whenever a delay is predicted, the\nsystem attempts to forge an agreement for transferring the parcel from the\ncurrent deliverer to a more promising courier nearby. 
Our experiments show that\nthrough accurate delay predictions and purposeful task transfers many delays\ncan be prevented that would occur without our approach.\n","authors":["Jeremias Dötterl","Ralf Bruns","Jürgen Dunkel","Sascha Ossowski"],"pdf_url":"https://arxiv.org/pdf/2401.12108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12103v1","updated":"2024-01-22T16:38:33Z","published":"2024-01-22T16:38:33Z","title":"LearnedWMP: Workload Memory Prediction Using Distribution of Query\n Templates","summary":" In a modern DBMS, working memory is frequently the limiting factor when\nprocessing in-memory analytic query operations such as joins, sorting, and\naggregation. Existing resource estimation approaches for a DBMS estimate the\nresource consumption of a query by computing an estimate of each individual\ndatabase operator in the query execution plan. Such an approach is slow and\nerror-prone as it relies upon simplifying assumptions, such as uniformity and\nindependence of the underlying data. Additionally, the existing approach\nfocuses on individual queries separately and does not factor in other queries\nin the workload that may be executed concurrently. In this research, we are\ninterested in query performance optimization under concurrent execution of a\nbatch of queries (a workload). Specifically, we focus on predicting the memory\ndemand for a workload rather than providing separate estimates for each query\nwithin it. We introduce the problem of workload memory prediction and formalize\nit as a distribution regression problem. We propose Learned Workload Memory\nPrediction (LearnedWMP) to improve and simplify estimating the working memory\ndemands of workloads. Through a comprehensive experimental evaluation, we show\nthat LearnedWMP reduces the memory estimation error of the\nstate-of-the-practice method by up to 47.6%. Compared to an alternative\nsingle-query model, during training and inferencing, the LearnedWMP model and\nits variants were 3x to 10x faster. Moreover, LearnedWMP-based models were at\nleast 50% smaller in most cases. Overall, the results demonstrate the\nadvantages of the LearnedWMP approach and its potential for a broader impact on\nquery performance optimization.\n","authors":["Shaikh Quader","Andres Jaramillo","Sumona Mukhopadhyay","Ghadeer Abuoda","Calisto Zuzarte","David Kalmuk","Marin Litoiu","Manos Papagelis"],"pdf_url":"https://arxiv.org/pdf/2401.12103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17028v2","updated":"2024-01-22T16:25:13Z","published":"2023-05-26T15:36:59Z","title":"Better Batch for Deep Probabilistic Time Series Forecasting","summary":" Deep probabilistic time series forecasting has gained significant attention\ndue to its superior performance in nonlinear approximation and its ability to\nprovide valuable uncertainty quantification for decision-making tasks. However,\nmany existing models oversimplify the problem by assuming that the error\nprocess is time-independent, thereby overlooking the serial correlation in the\nerror process. To overcome this limitation, we propose an innovative training\nmethod that incorporates error autocorrelation to further enhance the accuracy\nof probabilistic forecasting. Our method involves constructing a mini-batch as\na collection of $D$ consecutive time series segments for model training and\nexplicitly learning a time-varying covariance matrix over each mini-batch that\nencodes the error correlation among adjacent time steps. 
The learned covariance\nmatrix can be used to improve prediction accuracy and enhance uncertainty\nquantification. We evaluate our method on two different neural forecasting\nmodels and multiple public datasets, and the experimental results confirm the\neffectiveness of the proposed approach in enhancing the performance of both\nmodels across a wide range of datasets, yielding notable improvements in\npredictive accuracy.\n","authors":["Vincent Zhihao Zheng","Seongjin Choi","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2305.17028v2.pdf","comment":"9 pages, 3 figures, camera-ready version, The 27th International\n Conference on Artificial Intelligence and Statistics (AISTATS 2024)"},{"id":"http://arxiv.org/abs/2401.12086v1","updated":"2024-01-22T16:24:43Z","published":"2024-01-22T16:24:43Z","title":"West-of-N: Synthetic Preference Generation for Improved Reward Modeling","summary":" The success of reinforcement learning from human feedback (RLHF) in language\nmodel alignment is strongly dependent on the quality of the underlying reward\nmodel. In this paper, we present a novel approach to improve reward model\nquality by generating synthetic preference data, thereby augmenting the\ntraining dataset with on-policy, high-quality preference pairs. Motivated by\nthe promising results of Best-of-N sampling strategies in language model\ntraining, we extend their application to reward model training. This results in\na self-training strategy to generate preference pairs by selecting the best and\nworst candidates in a pool of responses to a given query. Empirically, we find\nthat this approach improves the performance of any reward model, with an effect\ncomparable to the addition of a similar quantity of human preference data. This\nwork opens up new avenues of research for improving RLHF for language model\nalignment, by offering synthetic preference generation as a solution to reward\nmodeling challenges.\n","authors":["Alizée Pace","Jonathan Mallinson","Eric Malmi","Sebastian Krause","Aliaksei Severyn"],"pdf_url":"https://arxiv.org/pdf/2401.12086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12079v1","updated":"2024-01-22T16:21:19Z","published":"2024-01-22T16:21:19Z","title":"Collaborative Reinforcement Learning Based Unmanned Aerial Vehicle (UAV)\n Trajectory Design for 3D UAV Tracking","summary":" In this paper, the problem of using one active unmanned aerial vehicle (UAV)\nand four passive UAVs to localize a 3D target UAV in real time is investigated.\nIn the considered model, each passive UAV receives reflection signals from the\ntarget UAV, which are initially transmitted by the active UAV. The received\nreflection signals allow each passive UAV to estimate the signal transmission\ndistance which will be transmitted to a base station (BS) for the estimation of\nthe position of the target UAV. Due to the movement of the target UAV, each\nactive/passive UAV must optimize its trajectory to continuously localize the\ntarget UAV. Meanwhile, since the accuracy of the distance estimation depends on\nthe signal-to-noise ratio of the transmission signals, the active UAV must\noptimize its transmit power. This problem is formulated as an optimization\nproblem whose goal is to jointly optimize the transmit power of the active UAV\nand trajectories of both active and passive UAVs so as to maximize the target\nUAV positioning accuracy. To solve this problem, a Z function decomposition\nbased reinforcement learning (ZD-RL) method is proposed. 
Compared to value\nfunction decomposition based RL (VD-RL), the proposed method can find the\nprobability distribution of the sum of future rewards to accurately estimate\nthe expected value of the sum of future rewards thus finding better transmit\npower of the active UAV and trajectories for both active and passive UAVs and\nimproving target UAV positioning accuracy. Simulation results show that the\nproposed ZD-RL method can reduce the positioning errors by up to 39.4% and\n64.6%, compared to VD-RL and independent deep RL methods, respectively.\n","authors":["Yujiao Zhu","Mingzhe Chen","Sihua Wang","Ye Hu","Yuchen Liu","Changchuan Yin"],"pdf_url":"https://arxiv.org/pdf/2401.12079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12070v1","updated":"2024-01-22T16:09:47Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v1.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2401.12069v1","updated":"2024-01-22T16:08:41Z","published":"2024-01-22T16:08:41Z","title":"Beyond TreeSHAP: Efficient Computation of Any-Order Shapley Interactions\n for Tree Ensembles","summary":" While shallow decision trees may be interpretable, larger ensemble models\nlike gradient-boosted trees, which often set the state of the art in machine\nlearning problems involving tabular data, still remain black box models. As a\nremedy, the Shapley value (SV) is a well-known concept in explainable\nartificial intelligence (XAI) research for quantifying additive feature\nattributions of predictions. The model-specific TreeSHAP methodology solves the\nexponential complexity for retrieving exact SVs from tree-based models.\nExpanding beyond individual feature attribution, Shapley interactions reveal\nthe impact of intricate feature interactions of any order. In this work, we\npresent TreeSHAP-IQ, an efficient method to compute any-order additive Shapley\ninteractions for predictions of tree-based models. TreeSHAP-IQ is supported by\na mathematical framework that exploits polynomial arithmetic to compute the\ninteraction scores in a single recursive traversal of the tree, akin to Linear\nTreeSHAP. 
We apply TreeSHAP-IQ on state-of-the-art tree ensembles and explore\ninteractions on well-established benchmark datasets.\n","authors":["Maximilian Muschalik","Fabian Fumagalli","Barbara Hammer","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2401.12069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12068v1","updated":"2024-01-22T16:05:30Z","published":"2024-01-22T16:05:30Z","title":"Resource-constrained stereo singing voice cancellation","summary":" We study the problem of stereo singing voice cancellation, a subtask of music\nsource separation, whose goal is to estimate an instrumental background from a\nstereo mix. We explore how to achieve performance similar to large\nstate-of-the-art source separation networks starting from a small, efficient\nmodel for real-time speech separation. Such a model is useful when memory and\ncompute are limited and singing voice processing has to run with limited\nlook-ahead. In practice, this is realised by adapting an existing mono model to\nhandle stereo input. Improvements in quality are obtained by tuning model\nparameters and expanding the training set. Moreover, we highlight the benefits\na stereo model brings by introducing a new metric which detects attenuation\ninconsistencies between channels. Our approach is evaluated using objective\noffline metrics and a large-scale MUSHRA trial, confirming the effectiveness of\nour techniques in stringent listening tests.\n","authors":["Clara Borrelli","James Rae","Dogac Basaran","Matt McVicar","Mehrez Souden","Matthias Mauch"],"pdf_url":"https://arxiv.org/pdf/2401.12068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12058v1","updated":"2024-01-22T15:50:32Z","published":"2024-01-22T15:50:32Z","title":"The Dimension Strikes Back with Gradients: Generalization of Gradient\n Methods in Stochastic Convex Optimization","summary":" We study the generalization performance of gradient methods in the\nfundamental stochastic convex optimization setting, focusing on its dimension\ndependence. First, for full-batch gradient descent (GD) we give a construction\nof a learning problem in dimension $d=O(n^2)$, where the canonical version of\nGD (tuned for optimal performance of the empirical risk) trained with $n$\ntraining examples converges, with constant probability, to an approximate\nempirical risk minimizer with $\\Omega(1)$ population excess risk. Our bound\ntranslates to a lower bound of $\\Omega (\\sqrt{d})$ on the number of training\nexamples required for standard GD to reach a non-trivial test error, answering\nan open question raised by Feldman (2016) and Amir, Koren, and Livni (2021b)\nand showing that a non-trivial dimension dependence is unavoidable.\nFurthermore, for standard one-pass stochastic gradient descent (SGD), we show\nthat an application of the same construction technique provides a similar\n$\\Omega(\\sqrt{d})$ lower bound for the sample complexity of SGD to reach a\nnon-trivial empirical error, despite achieving optimal test performance. 
This\nagain provides an exponential improvement in the dimension dependence compared\nto previous work (Koren, Livni, Mansour, and Sherman, 2022), resolving an open\nquestion left therein.\n","authors":["Matan Schliserman","Uri Sherman","Tomer Koren"],"pdf_url":"https://arxiv.org/pdf/2401.12058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12055v1","updated":"2024-01-22T15:47:05Z","published":"2024-01-22T15:47:05Z","title":"NEUROSEC: FPGA-Based Neuromorphic Audio Security","summary":" Neuromorphic systems, inspired by the complexity and functionality of the\nhuman brain, have gained interest in academic and industrial attention due to\ntheir unparalleled potential across a wide range of applications. While their\ncapabilities herald innovation, it is imperative to underscore that these\ncomputational paradigms, analogous to their traditional counterparts, are not\nimpervious to security threats. Although the exploration of neuromorphic\nmethodologies for image and video processing has been rigorously pursued, the\nrealm of neuromorphic audio processing remains in its early stages. Our results\nhighlight the robustness and precision of our FPGA-based neuromorphic system.\nSpecifically, our system showcases a commendable balance between desired signal\nand background noise, efficient spike rate encoding, and unparalleled\nresilience against adversarial attacks such as FGSM and PGD. A standout feature\nof our framework is its detection rate of 94%, which, when compared to other\nmethodologies, underscores its greater capability in identifying and mitigating\nthreats within 5.39 dB, a commendable SNR ratio. Furthermore, neuromorphic\ncomputing and hardware security serve many sensor domains in mission-critical\nand privacy-preserving applications.\n","authors":["Murat Isik","Hiruna Vishwamith","Yusuf Sur","Kayode Inadagbo","I. Can Dikmen"],"pdf_url":"https://arxiv.org/pdf/2401.12055v1.pdf","comment":"Audio processing, FPGA, Hardware Security, Neuromorphic Computing"},{"id":"http://arxiv.org/abs/2401.12046v1","updated":"2024-01-22T15:38:29Z","published":"2024-01-22T15:38:29Z","title":"Fourier Transporter: Bi-Equivariant Robotic Manipulation in 3D","summary":" Many complex robotic manipulation tasks can be decomposed as a sequence of\npick and place actions. Training a robotic agent to learn this sequence over\nmany different starting conditions typically requires many iterations or\ndemonstrations, especially in 3D environments. In this work, we propose Fourier\nTransporter (\\ours{}) which leverages the two-fold $\\SE(d)\\times\\SE(d)$\nsymmetry in the pick-place problem to achieve much higher sample efficiency.\n\\ours{} is an open-loop behavior cloning method trained using expert\ndemonstrations to predict pick-place actions on new environments. \\ours{} is\nconstrained to incorporate symmetries of the pick and place actions\nindependently. Our method utilizes a fiber space Fourier transformation that\nallows for memory-efficient construction. 
We test our proposed network on the\nRLbench benchmark and achieve state-of-the-art results across various tasks.\n","authors":["Haojie Huang","Owen Howell","Xupeng Zhu","Dian Wang","Robin Walters","Robert Platt"],"pdf_url":"https://arxiv.org/pdf/2401.12046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08865v2","updated":"2024-01-22T15:30:08Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v2.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2401.12033v1","updated":"2024-01-22T15:19:18Z","published":"2024-01-22T15:19:18Z","title":"Momentum-SAM: Sharpness Aware Minimization without Computational\n Overhead","summary":" The recently proposed optimization algorithm for deep neural networks\nSharpness Aware Minimization (SAM) suggests perturbing parameters before\ngradient calculation by a gradient ascent step to guide the optimization into\nparameter space regions of flat loss. While significant generalization\nimprovements and thus reduction of overfitting could be demonstrated, the\ncomputational costs are doubled due to the additionally needed gradient\ncalculation, making SAM unfeasible in case of limited computationally\ncapacities. 
Motivated by Nesterov Accelerated Gradient (NAG) we propose\nMomentum-SAM (MSAM), which perturbs parameters in the direction of the\naccumulated momentum vector to achieve low sharpness without significant\ncomputational overhead or memory demands over SGD or Adam. We evaluate MSAM in\ndetail and reveal insights on separable mechanisms of NAG, SAM and MSAM\nregarding training optimization and generalization. Code is available at\nhttps://github.com/MarlonBecker/MSAM.\n","authors":["Marlon Becker","Frederick Altrock","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2401.12033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12024v1","updated":"2024-01-22T15:11:57Z","published":"2024-01-22T15:11:57Z","title":"Multimodal Visual-Tactile Representation Learning through\n Self-Supervised Contrastive Pre-Training","summary":" The rapidly evolving field of robotics necessitates methods that can\nfacilitate the fusion of multiple modalities. Specifically, when it comes to\ninteracting with tangible objects, effectively combining visual and tactile\nsensory data is key to understanding and navigating the complex dynamics of the\nphysical world, enabling a more nuanced and adaptable response to changing\nenvironments. Nevertheless, much of the earlier work in merging these two\nsensory modalities has relied on supervised methods utilizing datasets labeled\nby humans.This paper introduces MViTac, a novel methodology that leverages\ncontrastive learning to integrate vision and touch sensations in a\nself-supervised fashion. By availing both sensory inputs, MViTac leverages\nintra and inter-modality losses for learning representations, resulting in\nenhanced material property classification and more adept grasping prediction.\nThrough a series of experiments, we showcase the effectiveness of our method\nand its superiority over existing state-of-the-art self-supervised and\nsupervised techniques. In evaluating our methodology, we focus on two distinct\ntasks: material classification and grasping success prediction. Our results\nindicate that MViTac facilitates the development of improved modality encoders,\nyielding more robust representations as evidenced by linear probing\nassessments.\n","authors":["Vedant Dave","Fotios Lygerakis","Elmar Rueckert"],"pdf_url":"https://arxiv.org/pdf/2401.12024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13141v2","updated":"2024-01-22T15:07:26Z","published":"2023-12-20T16:02:25Z","title":"Augment on Manifold: Mixup Regularization with UMAP","summary":" Data augmentation techniques play an important role in enhancing the\nperformance of deep learning models. Despite their proven benefits in computer\nvision tasks, their application in the other domains remains limited. This\npaper proposes a Mixup regularization scheme, referred to as UMAP Mixup,\ndesigned for ``on-manifold\" automated data augmentation for deep learning\npredictive models. The proposed approach ensures that the Mixup operations\nresult in synthesized samples that lie on the data manifold of the features and\nlabels by utilizing a dimensionality reduction technique known as uniform\nmanifold approximation and projection. 
Evaluations across diverse regression\ntasks show that UMAP Mixup is competitive with or outperforms other Mixup\nvariants, show promise for its potential as an effective tool for enhancing the\ngeneralization performance of deep learning models.\n","authors":["Yousef El-Laham","Elizabeth Fons","Dillon Daudert","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13141v2.pdf","comment":"accepted paper to be published in the proceedings of ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.14212v3","updated":"2024-01-22T15:05:30Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations. We introduce the term annotation sensitivity to refer to the\nimpact of annotation data collection methods on the annotations themselves and\non downstream model performance and predictions. We collect annotations of hate\nspeech and offensive language in five experimental conditions of an annotation\ninstrument, randomly assigning annotators to conditions. We then fine-tune BERT\nmodels on each of the five resulting datasets and evaluate model performance on\na holdout portion of each condition. We find considerable differences between\nthe conditions for 1) the share of hate speech/offensive language annotations,\n2) model performance, 3) model predictions, and 4) model learning curves. Our\nresults emphasize the crucial role played by the annotation instrument which\nhas received little attention in the machine learning literature. We call for\nadditional research into how and why the instrument impacts the annotations to\ninform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v3.pdf","comment":"EMNLP 2023 Findings:\n https://aclanthology.org/2023.findings-emnlp.992/"},{"id":"http://arxiv.org/abs/2312.13152v2","updated":"2024-01-22T15:04:57Z","published":"2023-12-20T16:16:29Z","title":"Neural Stochastic Differential Equations with Change Points: A\n Generative Adversarial Approach","summary":" Stochastic differential equations (SDEs) have been widely used to model real\nworld random phenomena. Existing works mainly focus on the case where the time\nseries is modeled by a single SDE, which might be restrictive for modeling time\nseries with distributional shift. In this work, we propose a change point\ndetection algorithm for time series modeled as neural SDEs. Given a time series\ndataset, the proposed method jointly learns the unknown change points and the\nparameters of distinct neural SDE models corresponding to each change point.\nSpecifically, the SDEs are learned under the framework of generative\nadversarial networks (GANs) and the change points are detected based on the\noutput of the GAN discriminator in a forward pass. At each step of the proposed\nalgorithm, the change points and the SDE model parameters are updated in an\nalternating fashion. 
Numerical results on both synthetic and real datasets are\nprovided to validate the performance of our algorithm in comparison to\nclassical change point detection benchmarks, standard GAN-based neural SDEs,\nand other state-of-the-art deep generative models for time series data.\n","authors":["Zhongchang Sun","Yousef El-Laham","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13152v2.pdf","comment":"accepted paper to be published in the proceedings of ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12014v1","updated":"2024-01-22T15:00:32Z","published":"2024-01-22T15:00:32Z","title":"Robustness to distribution shifts of compressed networks for edge\n devices","summary":" It is necessary to develop efficient DNNs deployed on edge devices with\nlimited computation resources. However, the compressed networks often execute\nnew tasks in the target domain, which is different from the source domain where\nthe original network is trained. It is important to investigate the robustness\nof compressed networks in two types of data distribution shifts: domain shifts\nand adversarial perturbations. In this study, we discover that compressed\nmodels are less robust to distribution shifts than their original networks.\nInterestingly, larger networks are more vulnerable to losing robustness than\nsmaller ones, even when they are compressed to a similar size as the smaller\nnetworks. Furthermore, compact networks obtained by knowledge distillation are\nmuch more robust to distribution shifts than pruned networks. Finally,\npost-training quantization is a reliable method for achieving significant\nrobustness to distribution shifts, and it outperforms both pruned and distilled\nmodels in terms of robustness.\n","authors":["Lulan Shen","Ali Edalati","Brett Meyer","Warren Gross","James J. Clark"],"pdf_url":"https://arxiv.org/pdf/2401.12014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12012v1","updated":"2024-01-22T14:59:11Z","published":"2024-01-22T14:59:11Z","title":"TurboSVM-FL: Boosting Federated Learning through SVM Aggregation for\n Lazy Clients","summary":" Federated learning is a distributed collaborative machine learning paradigm\nthat has gained strong momentum in recent years. In federated learning, a\ncentral server periodically coordinates models with clients and aggregates the\nmodels trained locally by clients without necessitating access to local data.\nDespite its potential, the implementation of federated learning continues to\nencounter several challenges, predominantly the slow convergence that is\nlargely due to data heterogeneity. The slow convergence becomes particularly\nproblematic in cross-device federated learning scenarios where clients may be\nstrongly limited by computing power and storage space, and hence counteracting\nmethods that induce additional computation or memory cost on the client side\nsuch as auxiliary objective terms and larger training iterations can be\nimpractical. In this paper, we propose a novel federated aggregation strategy,\nTurboSVM-FL, that poses no additional computation burden on the client side and\ncan significantly accelerate convergence for federated classification task,\nespecially when clients are \"lazy\" and train their models solely for few epochs\nfor next global aggregation. TurboSVM-FL extensively utilizes support vector\nmachine to conduct selective aggregation and max-margin spread-out\nregularization on class embeddings. 
We evaluate TurboSVM-FL on multiple\ndatasets including FEMNIST, CelebA, and Shakespeare using user-independent\nvalidation with non-iid data distribution. Our results show that TurboSVM-FL\ncan significantly outperform existing popular algorithms on convergence rate\nand reduce communication rounds while delivering better test metrics including\naccuracy, F1 score, and MCC.\n","authors":["Mengdi Wang","Anna Bodonhelyi","Efe Bozkir","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2401.12012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12007v1","updated":"2024-01-22T14:55:01Z","published":"2024-01-22T14:55:01Z","title":"Tensor-view Topological Graph Neural Network","summary":" Graph classification is an important learning task for graph-structured data.\nGraph neural networks (GNNs) have recently gained growing attention in graph\nlearning and have shown significant improvements in many important graph\nproblems. Despite their state-of-the-art performances, existing GNNs only use\nlocal information from a very limited neighborhood around each node, suffering\nfrom loss of multi-modal information and overheads of excessive computation. To\naddress these issues, we propose a novel Tensor-view Topological Graph Neural\nNetwork (TTG-NN), a class of simple yet effective topological deep learning\nbuilt upon persistent homology, graph convolution, and tensor operations. This\nnew method incorporates tensor learning to simultaneously capture Tensor-view\nTopological (TT), as well as Tensor-view Graph (TG) structural information on\nboth local and global levels. Computationally, to fully exploit graph topology\nand structure, we propose two flexible TT and TG representation learning\nmodules that disentangle feature tensor aggregation and transformation and\nlearn to preserve multi-modal structure with less computation. Theoretically,\nwe derive high probability bounds on both the out-of-sample and in-sample mean\nsquared approximation errors for our proposed Tensor Transformation Layer\n(TTL). Real data experiments show that the proposed TTG-NN outperforms 20\nstate-of-the-art methods on various graph benchmarks.\n","authors":["Tao Wen","Elynn Chen","Yuzhou Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12007v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2309.12701v2","updated":"2024-01-22T14:53:22Z","published":"2023-09-22T08:18:08Z","title":"Decision Tree Search as a Markov Decision Problem","summary":" Finding an optimal decision tree for a supervised learning task is a\nchallenging combinatorial problem to solve at scale. It was recently proposed\nto frame the problem as a Markov Decision Problem (MDP) and use deep\nreinforcement learning to tackle scaling. Unfortunately, these methods are not\ncompetitive with the current branch-and-bound state-of-the-art. We propose\ninstead to scale the resolution of such MDPs using an information-theoretic\ntests generating function that heuristically, and dynamically for every state,\nlimits the set of admissible test actions to a few good candidates. As a\nsolver, we show empirically that our algorithm is at the very least competitive\nwith branch-and-bound alternatives. As a machine learning tool, a key advantage\nof our approach is to solve for multiple complexity-performance trade-offs at\nvirtually no additional cost. 
With such a set of solutions, a user can then\nselect the tree that generalizes best and which has the interpretability level\nthat best suits their needs, which no current branch-and-bound method allows.\n","authors":["Hector Kohler","Riad Akrour","Philippe Preux"],"pdf_url":"https://arxiv.org/pdf/2309.12701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12004v1","updated":"2024-01-22T14:53:21Z","published":"2024-01-22T14:53:21Z","title":"NLCG-Net: A Model-Based Zero-Shot Learning Framework for Undersampled\n Quantitative MRI Reconstruction","summary":" Typical quantitative MRI (qMRI) methods estimate parameter maps after image\nreconstructing, which is prone to biases and error propagation. We propose a\nNonlinear Conjugate Gradient (NLCG) optimizer for model-based T2/T1 estimation,\nwhich incorporates U-Net regularization trained in a scan-specific manner. This\nend-to-end method directly estimates qMRI maps from undersampled k-space data\nusing mono-exponential signal modeling with zero-shot scan-specific neural\nnetwork regularization to enable high fidelity T1 and T2 mapping. T2 and T1\nmapping results demonstrate the ability of the proposed NLCG-Net to improve\nestimation quality compared to subspace reconstruction at high accelerations.\n","authors":["Xinrui Jiang","Yohan Jun","Jaejin Cho","Mengze Gao","Xingwang Yong","Berkin Bilgic"],"pdf_url":"https://arxiv.org/pdf/2401.12004v1.pdf","comment":"8 pages, 5 figures, submitted to International Society for Magnetic\n Resonance in Medicine 2024"},{"id":"http://arxiv.org/abs/2401.12002v1","updated":"2024-01-22T14:52:34Z","published":"2024-01-22T14:52:34Z","title":"HgbNet: predicting hemoglobin level/anemia degree from EHR data","summary":" Anemia is a prevalent medical condition that typically requires invasive\nblood tests for diagnosis and monitoring. Electronic health records (EHRs) have\nemerged as valuable data sources for numerous medical studies. EHR-based\nhemoglobin level/anemia degree prediction is non-invasive and rapid but still\nfaces some challenges due to the fact that EHR data is typically an irregular\nmultivariate time series containing a significant number of missing values and\nirregular time intervals. To address these issues, we introduce HgbNet, a\nmachine learning-based prediction model that emulates clinicians'\ndecision-making processes for hemoglobin level/anemia degree prediction. The\nmodel incorporates a NanDense layer with a missing indicator to handle missing\nvalues and employs attention mechanisms to account for both local irregularity\nand global irregularity. We evaluate the proposed method using two real-world\ndatasets across two use cases. In our first use case, we predict hemoglobin\nlevel/anemia degree at moment T+1 by utilizing records from moments prior to\nT+1. In our second use case, we integrate all historical records with\nadditional selected test results at moment T+1 to predict hemoglobin\nlevel/anemia degree at the same moment, T+1. HgbNet outperforms the best\nbaseline results across all datasets and use cases. These findings demonstrate\nthe feasibility of estimating hemoglobin levels and anemia degree from EHR\ndata, positioning HgbNet as an effective non-invasive anemia diagnosis solution\nthat could potentially enhance the quality of life for millions of affected\nindividuals worldwide. 
To our knowledge, HgbNet is the first machine learning\nmodel leveraging EHR data for hemoglobin level/anemia degree prediction.\n","authors":["Zhuo Zhi","Moe Elbadawi","Adam Daneshmend","Mine Orlu","Abdul Basit","Andreas Demosthenous","Miguel Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2401.12002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12000v1","updated":"2024-01-22T14:51:01Z","published":"2024-01-22T14:51:01Z","title":"Integrating Statistical Significance and Discriminative Power in Pattern\n Discovery","summary":" Pattern discovery plays a central role in both descriptive and predictive\ntasks across multiple domains. Actionable patterns must meet rigorous\nstatistical significance criteria and, in the presence of target variables,\nfurther uphold discriminative power. Our work addresses the underexplored area\nof guiding pattern discovery by integrating statistical significance and\ndiscriminative power criteria into state-of-the-art algorithms while preserving\npattern quality. We also address how pattern quality thresholds, imposed by\nsome algorithms, can be rectified to accommodate these additional criteria. To\ntest the proposed methodology, we select the triclustering task as the guiding\npattern discovery case and extend well-known greedy and multi-objective\noptimization triclustering algorithms, $\\delta$-Trimax and TriGen, that use\nvarious pattern quality criteria, such as Mean Squared Residual (MSR), Least\nSquared Lines (LSL), and Multi Slope Measure (MSL). Results from three case\nstudies show the role of the proposed methodology in discovering patterns with\npronounced improvements of discriminative power and statistical significance\nwithout quality deterioration, highlighting its importance in supervisedly\nguiding the search. Although the proposed methodology is motivated over\nmultivariate time series data, it can be straightforwardly extended to pattern\ndiscovery tasks involving multivariate, N-way (N>3), transactional, and\nsequential data structures.\n Availability: The code is freely available at\nhttps://github.com/JupitersMight/MOF_Triclustering under the MIT license.\n","authors":["Leonardo Alexandre","Rafael S. Costa","Rui Henriques"],"pdf_url":"https://arxiv.org/pdf/2401.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11993v1","updated":"2024-01-22T14:46:41Z","published":"2024-01-22T14:46:41Z","title":"Expert-Driven Monitoring of Operational ML Models","summary":" We propose Expert Monitoring, an approach that leverages domain expertise to\nenhance the detection and mitigation of concept drift in machine learning (ML)\nmodels. Our approach supports practitioners by consolidating domain expertise\nrelated to concept drift-inducing events, making this expertise accessible to\non-call personnel, and enabling automatic adaptability with expert oversight.\n","authors":["Joran Leest","Claudia Raibulet","Ilias Gerostathopoulos","Patricia Lago"],"pdf_url":"https://arxiv.org/pdf/2401.11993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11985v1","updated":"2024-01-22T14:38:25Z","published":"2024-01-22T14:38:25Z","title":"Scaling Face Interaction Graph Networks to Real World Scenes","summary":" Accurately simulating real world object dynamics is essential for various\napplications such as robotics, engineering, graphics, and design. To better\ncapture complex real dynamics such as contact and friction, learned simulators\nbased on graph networks have recently shown great promise. 
However, applying\nthese learned simulators to real scenes comes with two major challenges: first,\nscaling learned simulators to handle the complexity of real world scenes which\ncan involve hundreds of objects each with complicated 3D shapes, and second,\nhandling inputs from perception rather than 3D state information. Here we\nintroduce a method which substantially reduces the memory required to run\ngraph-based learned simulators. Based on this memory-efficient simulation\nmodel, we then present a perceptual interface in the form of editable NeRFs\nwhich can convert real-world scenes into a structured representation that can\nbe processed by graph network simulator. We show that our method uses\nsubstantially less memory than previous graph-based simulators while retaining\ntheir accuracy, and that the simulators learned in synthetic environments can\nbe applied to real world scenes captured from multiple camera angles. This\npaves the way for expanding the application of learned simulators to settings\nwhere only perceptual information is available at inference time.\n","authors":["Tatiana Lopez-Guevara","Yulia Rubanova","William F. Whitney","Tobias Pfaff","Kimberly Stachenfeld","Kelsey R. Allen"],"pdf_url":"https://arxiv.org/pdf/2401.11985v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.11974v1","updated":"2024-01-22T14:26:02Z","published":"2024-01-22T14:26:02Z","title":"Cross-Validation Conformal Risk Control","summary":" Conformal risk control (CRC) is a recently proposed technique that applies\npost-hoc to a conventional point predictor to provide calibration guarantees.\nGeneralizing conformal prediction (CP), with CRC, calibration is ensured for a\nset predictor that is extracted from the point predictor to control a risk\nfunction such as the probability of miscoverage or the false negative rate. The\noriginal CRC requires the available data set to be split between training and\nvalidation data sets. This can be problematic when data availability is\nlimited, resulting in inefficient set predictors. In this paper, a novel CRC\nmethod is introduced that is based on cross-validation, rather than on\nvalidation as the original CRC. The proposed cross-validation CRC (CV-CRC)\nextends a version of the jackknife-minmax from CP to CRC, allowing for the\ncontrol of a broader range of risk functions. CV-CRC is proved to offer\ntheoretical guarantees on the average risk of the set predictor. Furthermore,\nnumerical experiments show that CV-CRC can reduce the average set size with\nrespect to CRC when the available data are limited.\n","authors":["Kfir M. Cohen","Sangwoo Park","Osvaldo Simeone","Shlomo Shamai"],"pdf_url":"https://arxiv.org/pdf/2401.11974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08738v2","updated":"2024-01-22T14:17:27Z","published":"2024-01-16T18:31:23Z","title":"Machine Learning-Based Analysis of Ebola Virus' Impact on Gene\n Expression in Nonhuman Primates","summary":" This study introduces the Supervised Magnitude-Altitude Scoring (SMAS)\nmethodology, a machine learning-based approach, for analyzing gene expression\ndata obtained from nonhuman primates (NHPs) infected with Ebola virus (EBOV).\nWe utilize a comprehensive dataset of NanoString gene expression profiles from\nEbola-infected NHPs, deploying the SMAS system for nuanced host-pathogen\ninteraction analysis. 
SMAS effectively combines gene selection based on\nstatistical significance and expression changes, employing linear classifiers\nsuch as logistic regression to accurately differentiate between RT-qPCR\npositive and negative NHP samples. A key finding of our research is the\nidentification of IFI6 and IFI27 as critical biomarkers, demonstrating\nexceptional predictive performance with 100% accuracy and Area Under the Curve\n(AUC) metrics in classifying various stages of Ebola infection. Alongside IFI6\nand IFI27, genes, including MX1, OAS1, and ISG15, were significantly\nupregulated, highlighting their essential roles in the immune response to EBOV.\nOur results underscore the efficacy of the SMAS method in revealing complex\ngenetic interactions and response mechanisms during EBOV infection. This\nresearch provides valuable insights into EBOV pathogenesis and aids in\ndeveloping more precise diagnostic tools and therapeutic strategies to address\nEBOV infection in particular and viral infection in general.\n","authors":["Mostafa Rezapour","Muhammad Khalid Khan Niazi","Hao Lu","Aarthi Narayanan","Metin Nafi Gurcan"],"pdf_url":"https://arxiv.org/pdf/2401.08738v2.pdf","comment":"28 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.10451v2","updated":"2024-01-22T14:14:16Z","published":"2024-01-19T01:40:58Z","title":"Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian\n Optimization Approach","summary":" Solving large-scale capacity expansion problems (CEPs) is central to\ncost-effective decarbonization of regional-scale energy systems. To ensure the\nintended outcomes of CEPs, modeling uncertainty due to weather-dependent\nvariable renewable energy (VRE) supply and energy demand becomes crucially\nimportant. However, the resulting stochastic optimization models are often less\ncomputationally tractable than their deterministic counterparts. Here, we\npropose a learning-assisted approximate solution method to tractably solve\ntwo-stage stochastic CEPs. Our method identifies low-cost planning decisions by\nconstructing and solving a sequence of tractable temporally aggregated\nsurrogate problems. We adopt a Bayesian optimization approach to searching the\nspace of time series aggregation hyperparameters and compute approximate\nsolutions that minimize costs on a validation set of supply-demand projections.\nImportantly, we evaluate solved planning outcomes on a held-out set of test\nprojections. We apply our approach to generation and transmission expansion\nplanning for a joint power-gas system spanning New England. We show that our\napproach yields an estimated cost savings of up to 3.8% in comparison to\nbenchmark time series aggregation approaches.\n","authors":["Aron Brenner","Rahman Khorramfar","Dharik Mallapragada","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2401.10451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11963v1","updated":"2024-01-22T14:06:37Z","published":"2024-01-22T14:06:37Z","title":"Bridging Evolutionary Algorithms and Reinforcement Learning: A\n Comprehensive Survey","summary":" Evolutionary Reinforcement Learning (ERL), which integrates Evolutionary\nAlgorithms (EAs) and Reinforcement Learning (RL) for optimization, has\ndemonstrated remarkable performance advancements. By fusing the strengths of\nboth approaches, ERL has emerged as a promising research direction. 
This survey\noffers a comprehensive overview of the diverse research branches in ERL.\nSpecifically, we systematically summarize recent advancements in relevant\nalgorithms and identify three primary research directions: EA-assisted\noptimization of RL, RL-assisted optimization of EA, and synergistic\noptimization of EA and RL. Following that, we conduct an in-depth analysis of\neach research direction, organizing multiple research branches. We elucidate\nthe problems that each branch aims to tackle and how the integration of EA and\nRL addresses these challenges. In conclusion, we discuss potential challenges\nand prospective future research directions across various research directions.\n","authors":["Pengyi Li","Jianye Hao","Hongyao Tang","Xian Fu","Yan Zheng","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2401.11963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11954v1","updated":"2024-01-22T13:54:26Z","published":"2024-01-22T13:54:26Z","title":"RUMBoost: Gradient Boosted Random Utility Models","summary":" This paper introduces the RUMBoost model, a novel discrete choice modelling\napproach that combines the interpretability and behavioural robustness of\nRandom Utility Models (RUMs) with the generalisation and predictive ability of\ndeep learning methods. We obtain the full functional form of non-linear utility\nspecifications by replacing each linear parameter in the utility functions of a\nRUM with an ensemble of gradient boosted regression trees. This enables\npiece-wise constant utility values to be imputed for all alternatives directly\nfrom the data for any possible combination of input variables. We introduce\nadditional constraints on the ensembles to ensure three crucial features of the\nutility specifications: (i) dependency of the utilities of each alternative on\nonly the attributes of that alternative, (ii) monotonicity of marginal\nutilities, and (iii) an intrinsically interpretable functional form, where the\nexact response of the model is known throughout the entire input space.\nFurthermore, we introduce an optimisation-based smoothing technique that\nreplaces the piece-wise constant utility values of alternative attributes with\nmonotonic piece-wise cubic splines to identify non-linear parameters with\ndefined gradient. We demonstrate the potential of the RUMBoost model compared\nto various ML and Random Utility benchmark models for revealed preference mode\nchoice data from London. The results highlight the great predictive performance\nand the direct interpretability of our proposed approach. Furthermore, the\nsmoothed attribute utility functions allow for the calculation of various\nbehavioural indicators and marginal utilities. Finally, we demonstrate the\nflexibility of our methodology by showing how the RUMBoost model can be\nextended to complex model specifications, including attribute interactions,\ncorrelation within alternative error terms and heterogeneity within the\npopulation.\n","authors":["Nicolas Salvadé","Tim Hillel"],"pdf_url":"https://arxiv.org/pdf/2401.11954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.04033v4","updated":"2024-01-22T13:53:09Z","published":"2021-09-09T04:48:54Z","title":"New Versions of Gradient Temporal Difference Learning","summary":" Sutton, Szepesv\\'{a}ri and Maei introduced the first gradient\ntemporal-difference (GTD) learning algorithms compatible with both linear\nfunction approximation and off-policy training. 
The goal of this paper is (a)\nto propose some variants of GTDs with extensive comparative analysis and (b) to\nestablish new theoretical analysis frameworks for the GTDs. These variants are\nbased on convex-concave saddle-point interpretations of GTDs, which effectively\nunify all the GTDs into a single framework, and provide simple stability\nanalysis based on recent results on primal-dual gradient dynamics. Finally,\nnumerical comparative analysis is given to evaluate these approaches.\n","authors":["Donghwan Lee","Han-Dong Lim","Jihoon Park","Okyong Choi"],"pdf_url":"https://arxiv.org/pdf/2109.04033v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03242v2","updated":"2024-01-22T13:40:16Z","published":"2023-11-06T16:31:09Z","title":"Approximating Langevin Monte Carlo with ResNet-like Neural Network\n architectures","summary":" We sample from a given target distribution by constructing a neural network\nwhich maps samples from a simple reference, e.g. the standard normal\ndistribution, to samples from the target. To that end, we propose using a\nneural network architecture inspired by the Langevin Monte Carlo (LMC)\nalgorithm. Based on LMC perturbation results, we show approximation rates of\nthe proposed architecture for smooth, log-concave target distributions measured\nin the Wasserstein-$2$ distance. The analysis heavily relies on the notion of\nsub-Gaussianity of the intermediate measures of the perturbed LMC process. In\nparticular, we derive bounds on the growth of the intermediate variance proxies\nunder different assumptions on the perturbations. Moreover, we propose an\narchitecture similar to deep residual neural networks and derive expressivity\nresults for approximating the sample to target distribution map.\n","authors":["Charles Miranda","Janina Schütte","David Sommer","Martin Eigel"],"pdf_url":"https://arxiv.org/pdf/2311.03242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10107v2","updated":"2024-01-22T13:36:12Z","published":"2024-01-18T16:18:18Z","title":"Comparison analysis between standard polysomnographic data and\n in-ear-EEG signals: A preliminary study","summary":" Study Objectives: Polysomnography (PSG) currently serves as the benchmark for\nevaluating sleep disorders. Its discomfort, impracticality for home-use, and\nintroduction of bias in sleep quality assessment necessitate the exploration of\nless invasive, cost-effective, and portable alternatives. One promising\ncontender is the in-ear-EEG sensor, which offers advantages in terms of\ncomfort, fixed electrode positions, resistance to electromagnetic interference,\nand user-friendliness. This study aims to establish a methodology to assess the\nsimilarity between the in-ear-EEG signal and standard PSG.\n Methods: We assess the agreement between the PSG and in-ear-EEG derived\nhypnograms. We extract features in the time- and frequency- domain from PSG and\nin-ear-EEG 30-second epochs. We only consider the epochs where the PSG-scorers\nand the in-ear-EEG-scorers were in agreement. We introduce a methodology to\nquantify the similarity between PSG derivations and the single-channel\nin-ear-EEG. The approach relies on a comparison of distributions of selected\nfeatures -- extracted for each sleep stage and subject on both PSG and the\nin-ear-EEG signals -- via a Jensen-Shannon Divergence Feature-based Similarity\nIndex (JSD-FSI).\n Results: We found a high intra-scorer variability, mainly due to the\nuncertainty the scorers had in evaluating the in-ear-EEG signals. 
We show that\nthe similarity between PSG and in-ear-EEG signals is high (JSD-FSI: 0.61 +/-\n0.06 in awake, 0.60 +/- 0.07 in NREM and 0.51 +/- 0.08 in REM), and in line\nwith the similarity values computed independently on standard\nPSG-channel-combinations.\n Conclusions: In-ear-EEG is a valuable solution for home-based sleep\nmonitoring, however further studies with a larger and more heterogeneous\ndataset are needed.\n","authors":["Gianpaolo Palo","Luigi Fiorillo","Giuliana Monachino","Michal Bechny","Mark Melnykowycz","Athina Tzovara","Valentina Agostini","Francesca Dalia Faraci"],"pdf_url":"https://arxiv.org/pdf/2401.10107v2.pdf","comment":"29 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11940v1","updated":"2024-01-22T13:30:11Z","published":"2024-01-22T13:30:11Z","title":"Low-Tubal-Rank Tensor Recovery via Factorized Gradient Descent","summary":" This paper considers the problem of recovering a tensor with an underlying\nlow-tubal-rank structure from a small number of corrupted linear measurements.\nTraditional approaches tackling such a problem require the computation of\ntensor Singular Value Decomposition (t-SVD), that is a computationally\nintensive process, rendering them impractical for dealing with large-scale\ntensors. Aim to address this challenge, we propose an efficient and effective\nlow-tubal-rank tensor recovery method based on a factorization procedure akin\nto the Burer-Monteiro (BM) method. Precisely, our fundamental approach involves\ndecomposing a large tensor into two smaller factor tensors, followed by solving\nthe problem through factorized gradient descent (FGD). This strategy eliminates\nthe need for t-SVD computation, thereby reducing computational costs and\nstorage requirements. We provide rigorous theoretical analysis to ensure the\nconvergence of FGD under both noise-free and noisy situations. Additionally, it\nis worth noting that our method does not require the precise estimation of the\ntensor tubal-rank. Even in cases where the tubal-rank is slightly\noverestimated, our approach continues to demonstrate robust performance. 
A\nseries of experiments have been carried out to demonstrate that, as compared to\nother popular ones, our approach exhibits superior performance in multiple\nscenarios, in terms of the faster computational speed and the smaller\nconvergence error.\n","authors":["Zhiyu Liu","Zhi Han","Yandong Tang","Xi-Le Zhao","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11940v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11929v1","updated":"2024-01-22T13:15:40Z","published":"2024-01-22T13:15:40Z","title":"The Bigger the Better? Rethinking the Effective Model Scale in Long-term\n Time Series Forecasting","summary":" Long-term time series forecasting (LTSF) represents a critical frontier in\ntime series analysis, distinguished by its focus on extensive input sequences,\nin contrast to the constrained lengths typical of traditional approaches. While\nlonger sequences inherently convey richer information, potentially enhancing\npredictive precision, prevailing techniques often respond by escalating model\ncomplexity. These intricate models can inflate into millions of parameters,\nincorporating parameter-intensive elements like positional encodings,\nfeed-forward networks and self-attention mechanisms. This complexity, however,\nleads to prohibitive model scale, particularly given the time series data's\nsemantic simplicity. Motivated by the pursuit of parsimony, our research\nemploys conditional correlation and auto-correlation as investigative tools,\nrevealing significant redundancies within the input data. Leveraging these\ninsights, we introduce the HDformer, a lightweight Transformer variant enhanced\nwith hierarchical decomposition. This novel architecture not only inverts the\nprevailing trend toward model expansion but also accomplishes precise\nforecasting with drastically fewer computations and parameters. Remarkably,\nHDformer outperforms existing state-of-the-art LTSF models, while requiring\nover 99\\% fewer parameters. Through this work, we advocate a paradigm shift in\nLTSF, emphasizing the importance to tailor the model to the inherent dynamics\nof time series data-a timely reminder that in the realm of LTSF, bigger is not\ninvariably better.\n","authors":["Jinliang Deng","Xuan Song","Ivor W. Tsang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09126v2","updated":"2024-01-22T13:14:33Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Practical Low-light Raw Image\n Denoising","summary":" Recently, the mainstream practice for training low-light raw image denoising\nmethods has shifted towards employing synthetic data. Noise modeling, which\nfocuses on characterizing the noise distribution of real-world sensors,\nprofoundly influences the effectiveness and practicality of synthetic data.\nCurrently, physics-based noise modeling struggles to characterize the entire\nreal noise distribution, while learning-based noise modeling impractically\ndepends on paired real data. In this paper, we propose a novel strategy:\nlearning the noise model from dark frames instead of paired real data, to break\ndown the data dependency. Based on this strategy, we introduce an efficient\nphysics-guided noise neural proxy (PNNP) to approximate the real-world sensor\nnoise model. 
Specifically, we integrate physical priors into neural proxies and\nintroduce three efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution loss (DDL).\nPND decouples the dark frame into different components and handles different\nlevels of noise flexibly, which reduces the complexity of noise modeling. PPM\nincorporates physical priors to constrain the generated noise, which promotes\nthe accuracy of noise modeling. DDL provides explicit and reliable supervision\nfor noise distribution, which promotes the precision of noise modeling. PNNP\nexhibits powerful potential in characterizing the real noise distribution.\nExtensive experiments on public datasets demonstrate superior performance in\npractical low-light raw image denoising. The code will be available at\n\\url{https://github.com/fenghansen/PNNP}.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Lin Zhu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.10337v2","updated":"2024-01-22T12:33:43Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v2.pdf","comment":"accepted at EACL 2024, in ARR October 2023"},{"id":"http://arxiv.org/abs/2401.11888v1","updated":"2024-01-22T12:28:50Z","published":"2024-01-22T12:28:50Z","title":"Multimodal Deep Learning of Word-of-Mouth Text and Demographics to\n Predict Customer Rating: Handling Consumer Heterogeneity in Marketing","summary":" In the marketing field, understanding consumer heterogeneity, which is the\ninternal or psychological difference among consumers that cannot be captured by\nbehavioral logs, has long been a critical challenge. However, a number of\nconsumers today usually post their evaluation on the specific product on the\nonline platform, which can be the valuable source of such unobservable\ndifferences among consumers. Several previous studies have shown the validity\nof the analysis on text modality, but on the other hand, such analyses may not\nnecessarily demonstrate sufficient predictive accuracy for text alone, as they\nmay not include information readily available from cross-sectional data, such\nas consumer profile data. 
In addition, recent advances in machine learning\ntechniques, such as large-scale language models (LLMs) and multimodal learning\nhave made it possible to deal with the various kind of dataset simultaneously,\nincluding textual data and the traditional cross-sectional data, and the joint\nrepresentations can be effectively obtained from multiple modalities.\nTherefore, this study constructs a product evaluation model that takes into\naccount consumer heterogeneity by multimodal learning of online product reviews\nand consumer profile information. We also compare multiple models using\ndifferent modalities or hyper-parameters to demonstrate the robustness of\nmultimodal learning in marketing analysis.\n","authors":["Junichiro Niimi"],"pdf_url":"https://arxiv.org/pdf/2401.11888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15269v3","updated":"2024-01-22T12:26:44Z","published":"2022-06-30T13:20:48Z","title":"Deep Reinforcement Learning with Swin Transformers","summary":" Transformers are neural network models that utilize multiple layers of\nself-attention heads and have exhibited enormous potential in natural language\nprocessing tasks. Meanwhile, there have been efforts to adapt transformers to\nvisual tasks of machine learning, including Vision Transformers and Swin\nTransformers. Although some researchers use Vision Transformers for\nreinforcement learning tasks, their experiments remain at a small scale due to\nthe high computational cost. This article presents the first online\nreinforcement learning scheme that is based on Swin Transformers: Swin DQN. In\ncontrast to existing research, our novel approach demonstrate the superior\nperformance with experiments on 49 games in the Arcade Learning Environment.\nThe results show that our approach achieves significantly higher maximal\nevaluation scores than the baseline method in 45 of all the 49 games (92%), and\nhigher mean evaluation scores than the baseline method in 40 of all the 49\ngames (82%).\n","authors":["Li Meng","Morten Goodwin","Anis Yazidi","Paal Engelstad"],"pdf_url":"https://arxiv.org/pdf/2206.15269v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10393v2","updated":"2024-01-22T12:04:18Z","published":"2024-01-18T22:06:38Z","title":"Catastrophic Interference is Mitigated in Naturalistic Power-Law\n Learning Environments","summary":" Neural networks often suffer from catastrophic interference (CI): performance\non previously learned tasks drops off significantly when learning a new task.\nThis contrasts strongly with humans, who can sequentially learn new tasks\nwithout appreciably forgetting previous tasks. Prior work has explored various\ntechniques for mitigating CI such as regularization, rehearsal, generative\nreplay, and distillation methods. The current work takes a different approach,\none guided by cognitive science research showing that in naturalistic\nenvironments, the probability of encountering a task decreases as a power-law\nof the time since it was last performed. We argue that a realistic evaluation\nof techniques for the mitigation of CI should be performed in simulated\nnaturalistic learning environments. Thus, we evaluate the extent of mitigation\nof CI when training simple rehearsal-based methods in power-law environments\nsimilar to the ones humans face. Our work explores this novel rehearsal-based\napproach for a domain-incremental task: learning permutations in the MNIST\ntask. 
We compare our rehearsal environment with other baselines to show its\nefficacy in promoting continual learning. Additionally, we investigate whether\nthis environment shows forward facilitation, i.e., faster learning of later\ntasks. Next, we explore the robustness of our learning environment to the\nnumber of tasks, model size, and amount of data rehearsed after each task.\nNotably, our results show that the performance is comparable or superior to\nthat of models trained using popular regularization methods and also to\nrehearsals in non-power-law environments. The benefits of this training\nparadigm include simplicity and the lack of a need for extra neural circuitry.\nIn addition, because our method is orthogonal to other methods, future research\ncan combine training in power-law environments with other continual learning\nmechanisms.\n","authors":["Atith Gandhi","Raj Sanjay Shah","Vijay Marupudi","Sashank Varma"],"pdf_url":"https://arxiv.org/pdf/2401.10393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06064v4","updated":"2024-01-22T12:04:06Z","published":"2023-05-18T13:59:02Z","title":"Neural Algorithmic Reasoning for Combinatorial Optimisation","summary":" Solving NP-hard/complete combinatorial problems with neural networks is a\nchallenging research area that aims to surpass classical approximate\nalgorithms. The long-term objective is to outperform hand-designed heuristics\nfor NP-hard/complete problems by learning to generate superior solutions solely\nfrom training data. Current neural-based methods for solving CO problems often\noverlook the inherent \"algorithmic\" nature of the problems. In contrast,\nheuristics designed for CO problems, e.g. TSP, frequently leverage\nwell-established algorithms, such as those for finding the minimum spanning\ntree. In this paper, we propose leveraging recent advancements in neural\nalgorithmic reasoning to improve the learning of CO problems. Specifically, we\nsuggest pre-training our neural model on relevant algorithms before training it\non CO instances. Our results demonstrate that by using this learning setup, we\nachieve superior performance compared to non-algorithmically informed deep\nlearning models.\n","authors":["Dobrik Georgiev","Danilo Numeroso","Davide Bacciu","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2306.06064v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04073v2","updated":"2024-01-22T12:00:58Z","published":"2023-05-06T15:26:22Z","title":"Explaining RL Decisions with Trajectories","summary":" Explanation is a key component for the adoption of reinforcement learning\n(RL) in many real-world decision-making problems. In the literature, the\nexplanation is often provided by saliency attribution to the features of the RL\nagent's state. In this work, we propose a complementary approach to these\nexplanations, particularly for offline RL, where we attribute the policy\ndecisions of a trained RL agent to the trajectories encountered by it during\ntraining. To do so, we encode trajectories in offline training data\nindividually as well as collectively (encoding a set of trajectories). We then\nattribute policy decisions to a set of trajectories in this encoded space by\nestimating the sensitivity of the decision with respect to that set. 
Further,\nwe demonstrate the effectiveness of the proposed approach in terms of quality\nof attributions as well as practical scalability in diverse environments that\ninvolve both discrete and continuous state and action spaces such as\ngrid-worlds, video games (Atari) and continuous control (MuJoCo). We also\nconduct a human study on a simple navigation task to observe how their\nunderstanding of the task compares with data attributed for a trained RL\npolicy. Keywords -- Explainable AI, Verifiability of AI Decisions, Explainable\nRL.\n","authors":["Shripad Vilasrao Deshmukh","Arpan Dasgupta","Balaji Krishnamurthy","Nan Jiang","Chirag Agarwal","Georgios Theocharous","Jayakumar Subramanian"],"pdf_url":"https://arxiv.org/pdf/2305.04073v2.pdf","comment":"Published at International Conference on Learning Representations\n (ICLR), 2023"},{"id":"http://arxiv.org/abs/2210.00108v3","updated":"2024-01-22T11:51:29Z","published":"2022-09-30T21:59:24Z","title":"ImpNet: Imperceptible and blackbox-undetectable backdoors in compiled\n neural networks","summary":" Early backdoor attacks against machine learning set off an arms race in\nattack and defence development. Defences have since appeared demonstrating some\nability to detect backdoors in models or even remove them. These defences work\nby inspecting the training data, the model, or the integrity of the training\nprocedure. In this work, we show that backdoors can be added during\ncompilation, circumventing any safeguards in the data preparation and model\ntraining stages. The attacker can not only insert existing weight-based\nbackdoors during compilation, but also a new class of weight-independent\nbackdoors, such as ImpNet. These backdoors are impossible to detect during the\ntraining or data preparation processes, because they are not yet present. Next,\nwe demonstrate that some backdoors, including ImpNet, can only be reliably\ndetected at the stage where they are inserted and removing them anywhere else\npresents a significant challenge. We conclude that ML model security requires\nassurance of provenance along the entire technical pipeline, including the\ndata, model architecture, compiler, and hardware specification.\n","authors":["Tim Clifford","Ilia Shumailov","Yiren Zhao","Ross Anderson","Robert Mullins"],"pdf_url":"https://arxiv.org/pdf/2210.00108v3.pdf","comment":"10 pages, 7 figures, to be published in IEEE Secure and Trustworthy\n Machine Learning 2024. For website see https://ml.backdoors.uk . For source\n code, see https://git.sr.ht/~tim-clifford/impnet_source"},{"id":"http://arxiv.org/abs/2401.11860v1","updated":"2024-01-22T11:29:44Z","published":"2024-01-22T11:29:44Z","title":"A Review of Physics-Informed Machine Learning Methods with Applications\n to Condition Monitoring and Anomaly Detection","summary":" This study presents a comprehensive overview of PIML techniques in the\ncontext of condition monitoring. The central concept driving PIML is the\nincorporation of known physical laws and constraints into machine learning\nalgorithms, enabling them to learn from available data while remaining\nconsistent with physical principles. Through fusing domain knowledge with\ndata-driven learning, PIML methods offer enhanced accuracy and interpretability\nin comparison to purely data-driven approaches. 
In this comprehensive survey,\ndetailed examinations are performed with regard to the methodology by which\nknown physical principles are integrated within machine learning frameworks, as\nwell as their suitability for specific tasks within condition monitoring.\nIncorporation of physical knowledge into the ML model may be realized in a\nvariety of methods, with each having its unique advantages and drawbacks. The\ndistinct advantages and limitations of each methodology for the integration of\nphysics within data-driven models are detailed, considering factors such as\ncomputational efficiency, model interpretability, and generalizability to\ndifferent systems in condition monitoring and fault detection. Several case\nstudies and works of literature utilizing this emerging concept are presented\nto demonstrate the efficacy of PIML in condition monitoring applications. From\nthe literature reviewed, the versatility and potential of PIML in condition\nmonitoring may be demonstrated. Novel PIML methods offer an innovative solution\nfor addressing the complexities of condition monitoring and associated\nchallenges. This comprehensive survey helps form the foundation for future work\nin the field. As the technology continues to advance, PIML is expected to play\na crucial role in enhancing maintenance strategies, system reliability, and\noverall operational efficiency in engineering systems.\n","authors":["Yuandi Wu","Brett Sicard","Stephen Andrew Gadsden"],"pdf_url":"https://arxiv.org/pdf/2401.11860v1.pdf","comment":"Paper has been submitted for review to the journal Expert Systems\n with Applications (December 31, 2023). 90 pages, 22 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.16034v2","updated":"2024-01-22T11:26:35Z","published":"2023-09-27T21:26:01Z","title":"Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale\n Localization","summary":" Advancements in nanotechnology and material science are paving the way toward\nnanoscale devices that combine sensing, computing, data and energy storage, and\nwireless communication. In precision medicine, these nanodevices show promise\nfor disease diagnostics, treatment, and monitoring from within the patients'\nbloodstreams. Assigning the location of a sensed biological event with the\nevent itself, which is the main proposition of flow-guided in-body nanoscale\nlocalization, would be immensely beneficial from the perspective of precision\nmedicine. The nanoscale nature of the nanodevices and the challenging\nenvironment that the bloodstream represents, result in current flow-guided\nlocalization approaches being constrained in their communication and\nenergy-related capabilities. The communication and energy constraints of the\nnanodevices result in different features of raw data for flow-guided\nlocalization, in turn affecting its performance. An analytical modeling of the\neffects of imperfect communication and constrained energy causing intermittent\noperation of the nanodevices on the raw data produced by the nanodevices would\nbe beneficial. Hence, we propose an analytical model of raw data for\nflow-guided localization, where the raw data is modeled as a function of\ncommunication and energy-related capabilities of the nanodevice. We evaluate\nthe model by comparing its output with the one obtained through the utilization\nof a simulator for objective evaluation of flow-guided localization, featuring\ncomparably higher level of realism. 
Our results across a number of scenarios\nand heterogeneous performance metrics indicate high similarity between the\nmodel and simulator-generated raw datasets.\n","authors":["Guillem Pascual","Filip Lemic","Carmen Delgado","Xavier Costa-Perez"],"pdf_url":"https://arxiv.org/pdf/2309.16034v2.pdf","comment":"6 pages, 7 figures, 4 tables, 16 references"},{"id":"http://arxiv.org/abs/2309.10688v3","updated":"2024-01-22T11:26:17Z","published":"2023-09-19T15:23:07Z","title":"On the different regimes of Stochastic Gradient Descent","summary":" Modern deep networks are trained with stochastic gradient descent (SGD) whose\nkey hyperparameters are the number of data considered at each step or batch\nsize $B$, and the step size or learning rate $\\eta$. For small $B$ and large\n$\\eta$, SGD corresponds to a stochastic evolution of the parameters, whose\nnoise amplitude is governed by the `temperature' $T\\equiv \\eta/B$. Yet this\ndescription is observed to break down for sufficiently large batches $B\\geq\nB^*$, or simplifies to gradient descent (GD) when the temperature is\nsufficiently small. Understanding where these cross-overs take place remains a\ncentral challenge. Here, we resolve these questions for a teacher-student\nperceptron classification model and show empirically that our key predictions\nstill apply to deep networks. Specifically, we obtain a phase diagram in the\n$B$-$\\eta$ plane that separates three dynamical phases: \\textit{(i)} a\nnoise-dominated SGD governed by temperature, \\textit{(ii)} a\nlarge-first-step-dominated SGD and \\textit{(iii)} GD. These different phases\nalso correspond to different regimes of generalization error. Remarkably, our\nanalysis reveals that the batch size $B^*$ separating regimes \\textit{(i)} and\n\\textit{(ii)} scale with the size $P$ of the training set, with an exponent\nthat characterizes the hardness of the classification problem.\n","authors":["Antonio Sclocchi","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2309.10688v3.pdf","comment":"Main: 8 pages, 4 figures; Appendix: 20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09647v2","updated":"2024-01-22T11:14:39Z","published":"2023-08-18T16:07:01Z","title":"Robust Uncertainty Quantification Using Conformalised Monte Carlo\n Prediction","summary":" Deploying deep learning models in safety-critical applications remains a very\nchallenging task, mandating the provision of assurances for the dependable\noperation of these models. Uncertainty quantification (UQ) methods estimate the\nmodel's confidence per prediction, informing decision-making by considering the\neffect of randomness and model misspecification. Despite the advances of\nstate-of-the-art UQ methods, they are computationally expensive or produce\nconservative prediction sets/intervals. We introduce MC-CP, a novel hybrid UQ\nmethod that combines a new adaptive Monte Carlo (MC) dropout method with\nconformal prediction (CP). MC-CP adaptively modulates the traditional MC\ndropout at runtime to save memory and computation resources, enabling\npredictions to be consumed by CP, yielding robust prediction sets/intervals.\nThroughout comprehensive experiments, we show that MC-CP delivers significant\nimprovements over advanced UQ methods, like MC dropout, RAPS and CQR, both in\nclassification and regression benchmarks. 
MC-CP can be easily added to existing\nmodels, making its deployment simple.\n","authors":["Daniel Bethell","Simos Gerasimou","Radu Calinescu"],"pdf_url":"https://arxiv.org/pdf/2308.09647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11849v1","updated":"2024-01-22T11:08:36Z","published":"2024-01-22T11:08:36Z","title":"Self-Labeling the Job Shop Scheduling Problem","summary":" In this work, we propose a Self-Supervised training strategy specifically\ndesigned for combinatorial problems. One of the main obstacles in applying\nsupervised paradigms to such problems is the requirement of expensive target\nsolutions as ground-truth, often produced with costly exact solvers. Inspired\nby Semi- and Self-Supervised learning, we show that it is possible to easily\ntrain generative models by sampling multiple solutions and using the best one\naccording to the problem objective as a pseudo-label. In this way, we\niteratively improve the model generation capability by relying only on its\nself-supervision, completely removing the need for optimality information. We\nprove the effectiveness of this Self-Labeling strategy on the Job Shop\nScheduling (JSP), a complex combinatorial problem that is receiving much\nattention from the Reinforcement Learning community. We propose a generative\nmodel based on the well-known Pointer Network and train it with our strategy.\nExperiments on two popular benchmarks demonstrate the potential of this\napproach as the resulting models outperform constructive heuristics and current\nstate-of-the-art Reinforcement Learning proposals.\n","authors":["Andrea Corsini","Angelo Porrello","Simone Calderara","Mauro Dell'Amico"],"pdf_url":"https://arxiv.org/pdf/2401.11849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11844v1","updated":"2024-01-22T11:01:52Z","published":"2024-01-22T11:01:52Z","title":"Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field\n Crop Yield Prediction","summary":" Accurate crop yield prediction is of utmost importance for informed\ndecision-making in agriculture, aiding farmers, and industry stakeholders.\nHowever, this task is complex and depends on multiple factors, such as\nenvironmental conditions, soil properties, and management practices. Combining\nheterogeneous data views poses a fusion challenge, like identifying the\nview-specific contribution to the predictive task. We present a novel\nmulti-view learning approach to predict crop yield for different crops\n(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our\nmulti-view input data includes multi-spectral optical images from Sentinel-2\nsatellites and weather data as dynamic features during the crop growing season,\ncomplemented by static features like soil properties and topographic\ninformation. To effectively fuse the data, we introduce a Multi-view Gated\nFusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU)\nmodule. The view-encoders handle the heterogeneity of data sources with varying\ntemporal resolutions by learning a view-specific representation. These\nrepresentations are adaptively fused via a weighted sum. The fusion weights are\ncomputed for each sample by the GU using a concatenation of the\nview-representations. The MVGF model is trained at sub-field level with 10 m\nresolution pixels. Our evaluations show that the MVGF outperforms conventional\nmodels on the same task, achieving the best results by incorporating all the\ndata sources, unlike the usual fusion results in the literature. 
For Argentina,\nthe MVGF model achieves an R2 value of 0.68 at sub-field yield prediction,\nwhile at field level evaluation (comparing field averages), it reaches around\n0.80 across different countries. The GU module learned different weights based\non the country and crop-type, aligning with the variable significance of each\ndata source to the prediction task.\n","authors":["Francisco Mena","Deepak Pathak","Hiba Najjar","Cristhian Sanchez","Patrick Helber","Benjamin Bischke","Peter Habelitz","Miro Miranda","Jayanth Siddamsetty","Marlon Nuske","Marcela Charfuelan","Diego Arenas","Michaela Vollmer","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11840v1","updated":"2024-01-22T10:57:11Z","published":"2024-01-22T10:57:11Z","title":"Learning to Approximate Adaptive Kernel Convolution on Graphs","summary":" Various Graph Neural Networks (GNNs) have been successful in analyzing data\nin non-Euclidean spaces, however, they have limitations such as oversmoothing,\ni.e., information becomes excessively averaged as the number of hidden layers\nincreases. The issue stems from the intrinsic formulation of conventional graph\nconvolution where the nodal features are aggregated from a direct neighborhood\nper layer across the entire nodes in the graph. As setting different number of\nhidden layers per node is infeasible, recent works leverage a diffusion kernel\nto redefine the graph structure and incorporate information from farther nodes.\nUnfortunately, such approaches suffer from heavy diagonalization of a graph\nLaplacian or learning a large transform matrix. In this regards, we propose a\ndiffusion learning framework, where the range of feature aggregation is\ncontrolled by the scale of a diffusion kernel. For efficient computation, we\nderive closed-form derivatives of approximations of the graph convolution with\nrespect to the scale, so that node-wise range can be adaptively learned. With a\ndownstream classifier, the entire framework is made trainable in an end-to-end\nmanner. Our model is tested on various standard datasets for node-wise\nclassification for the state-of-the-art performance, and it is also validated\non a real-world brain network data for graph classifications to demonstrate its\npracticality for Alzheimer classification.\n","authors":["Jaeyoon Sim","Sooyeon Jeon","InJun Choi","Guorong Wu","Won Hwa Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11840v1.pdf","comment":"15 pages, Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11836v1","updated":"2024-01-22T10:52:22Z","published":"2024-01-22T10:52:22Z","title":"Privacy-Preserving Data Fusion for Traffic State Estimation: A Vertical\n Federated Learning Approach","summary":" This paper proposes a privacy-preserving data fusion method for traffic state\nestimation (TSE). Unlike existing works that assume all data sources to be\naccessible by a single trusted party, we explicitly address data privacy\nconcerns that arise in the collaboration and data sharing between multiple data\nowners, such as municipal authorities (MAs) and mobility providers (MPs). To\nthis end, we propose a novel vertical federated learning (FL) approach, FedTSE,\nthat enables multiple data owners to collaboratively train and apply a TSE\nmodel without having to exchange their private data. 
To enhance the\napplicability of the proposed FedTSE in common TSE scenarios with limited\navailability of ground-truth data, we further propose a privacy-preserving\nphysics-informed FL approach, i.e., FedTSE-PI, that integrates traffic models\ninto FL. Real-world data validation shows that the proposed methods can protect\nprivacy while yielding similar accuracy to the oracle method without privacy\nconsiderations.\n","authors":["Qiqing Wang","Kaidi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18394v5","updated":"2024-01-22T10:44:50Z","published":"2023-05-28T12:34:07Z","title":"On Optimal Regularization Parameters via Bilevel Learning","summary":" Variational regularization is commonly used to solve linear inverse problems,\nand involves augmenting a data fidelity by a regularizer. The regularizer is\nused to promote a priori information and is weighted by a regularization\nparameter. Selection of an appropriate regularization parameter is critical,\nwith various choices leading to very different reconstructions. Classical\nstrategies used to determine a suitable parameter value include the discrepancy\nprinciple and the L-curve criterion, and in recent years a supervised machine\nlearning approach called bilevel learning has been employed. Bilevel learning\nis a powerful framework to determine optimal parameters and involves solving a\nnested optimization problem. While previous strategies enjoy various\ntheoretical results, the well-posedness of bilevel learning in this setting is\nstill an open question. In particular, a necessary property is positivity of\nthe determined regularization parameter. In this work, we provide a new\ncondition that better characterizes positivity of optimal regularization\nparameters than the existing theory. Numerical results verify and explore this\nnew condition for both small and high-dimensional problems.\n","authors":["Matthias J. Ehrhardt","Silvia Gazzola","Sebastian J. Scott"],"pdf_url":"https://arxiv.org/pdf/2305.18394v5.pdf","comment":"34 pages, 11 figures. Version for publication"},{"id":"http://arxiv.org/abs/2401.11825v1","updated":"2024-01-22T10:38:14Z","published":"2024-01-22T10:38:14Z","title":"Sparse discovery of differential equations based on multi-fidelity\n Gaussian process","summary":" Sparse identification of differential equations aims to compute the analytic\nexpressions from the observed data explicitly. However, there exist two primary\nchallenges. Firstly, it exhibits sensitivity to the noise in the observed data,\nparticularly for the derivatives computations. Secondly, existing literature\npredominantly concentrates on single-fidelity (SF) data, which imposes\nlimitations on its applicability due to the computational cost. In this paper,\nwe present two novel approaches to address these problems from the view of\nuncertainty quantification. We construct a surrogate model employing the\nGaussian process regression (GPR) to mitigate the effect of noise in the\nobserved data, quantify its uncertainty, and ultimately recover the equations\naccurately. Subsequently, we exploit the multi-fidelity Gaussian processes\n(MFGP) to address scenarios involving multi-fidelity (MF), sparse, and noisy\nobserved data. 
We demonstrate the robustness and effectiveness of our\nmethodologies through several numerical experiments.\n","authors":["Yuhuang Meng","Yue Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.11825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07178v2","updated":"2024-01-22T10:31:56Z","published":"2023-12-12T11:22:31Z","title":"Beyond Expected Return: Accounting for Policy Reproducibility when\n Evaluating Reinforcement Learning Algorithms","summary":" Many applications in Reinforcement Learning (RL) usually have noise or\nstochasticity present in the environment. Beyond their impact on learning,\nthese uncertainties lead the exact same policy to perform differently, i.e.\nyield different return, from one roll-out to another. Common evaluation\nprocedures in RL summarise the consequent return distributions using solely the\nexpected return, which does not account for the spread of the distribution. Our\nwork defines this spread as the policy reproducibility: the ability of a policy\nto obtain similar performance when rolled out many times, a crucial property in\nsome real-world applications. We highlight that existing procedures that only\nuse the expected return are limited on two fronts: first an infinite number of\nreturn distributions with a wide range of performance-reproducibility\ntrade-offs can have the same expected return, limiting its effectiveness when\nused for comparing policies; second, the expected return metric does not leave\nany room for practitioners to choose the best trade-off value for considered\napplications. In this work, we address these limitations by recommending the\nuse of Lower Confidence Bound, a metric taken from Bayesian optimisation that\nprovides the user with a preference parameter to choose a desired\nperformance-reproducibility trade-off. We also formalise and quantify policy\nreproducibility, and demonstrate the benefit of our metrics using extensive\nexperiments of popular RL algorithms on common uncertain RL tasks.\n","authors":["Manon Flageat","Bryan Lim","Antoine Cully"],"pdf_url":"https://arxiv.org/pdf/2312.07178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11817v1","updated":"2024-01-22T10:26:14Z","published":"2024-01-22T10:26:14Z","title":"Hallucination is Inevitable: An Innate Limitation of Large Language\n Models","summary":" Hallucination has been widely recognized to be a significant drawback for\nlarge language models (LLMs). There have been many works that attempt to reduce\nthe extent of hallucination. These efforts have mostly been empirical so far,\nwhich cannot answer the fundamental question whether it can be completely\neliminated. In this paper, we formalize the problem and show that it is\nimpossible to eliminate hallucination in LLMs. Specifically, we define a formal\nworld where hallucination is defined as inconsistencies between a computable\nLLM and a computable ground truth function. By employing results from learning\ntheory, we show that LLMs cannot learn all of the computable functions and will\ntherefore always hallucinate. Since the formal world is a part of the real\nworld which is much more complicated, hallucinations are also inevitable for\nreal world LLMs. Furthermore, for real world LLMs constrained by provable time\ncomplexity, we describe the hallucination-prone tasks and empirically validate\nour claims. 
Finally, using the formal world framework, we discuss the possible\nmechanisms and efficacies of existing hallucination mitigators as well as the\npractical implications on the safe deployment of LLMs.\n","authors":["Ziwei Xu","Sanjay Jain","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2401.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11810v1","updated":"2024-01-22T10:14:45Z","published":"2024-01-22T10:14:45Z","title":"Generalization and Informativeness of Conformal Prediction","summary":" The safe integration of machine learning modules in decision-making processes\nhinges on their ability to quantify uncertainty. A popular technique to achieve\nthis goal is conformal prediction (CP), which transforms an arbitrary base\npredictor into a set predictor with coverage guarantees. While CP certifies the\npredicted set to contain the target quantity with a user-defined tolerance, it\ndoes not provide control over the average size of the predicted sets, i.e.,\nover the informativeness of the prediction. In this work, a theoretical\nconnection is established between the generalization properties of the base\npredictor and the informativeness of the resulting CP prediction sets. To this\nend, an upper bound is derived on the expected size of the CP set predictor\nthat builds on generalization error bounds for the base predictor. The derived\nupper bound provides insights into the dependence of the average size of the CP\nset predictor on the amount of calibration data, the target reliability, and\nthe generalization performance of the base predictor. The theoretical insights\nare validated using simple numerical regression and classification tasks.\n","authors":["Matteo Zecchin","Sangwoo Park","Osvaldo Simeone","Fredrik Hellström"],"pdf_url":"https://arxiv.org/pdf/2401.11810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01168v3","updated":"2024-01-22T10:09:20Z","published":"2022-12-02T13:47:21Z","title":"Towards Cross Domain Generalization of Hamiltonian Representation via\n Meta Learning","summary":" Recent advances in deep learning for physics have focused on discovering\nshared representations of target systems by incorporating physics priors or\ninductive biases into neural networks. While effective, these methods are\nlimited to the system domain, where the type of system remains consistent and\nthus cannot ensure the adaptation to new, or unseen physical systems governed\nby different laws. For instance, a neural network trained on a mass-spring\nsystem cannot guarantee accurate predictions for the behavior of a two-body\nsystem or any other system with different physical laws. In this work, we take\na significant leap forward by targeting cross domain generalization within the\nfield of Hamiltonian dynamics. We model our system with a graph neural network\nand employ a meta learning algorithm to enable the model to gain experience\nover a distribution of tasks and make it adapt to new physics. Our approach\naims to learn a unified Hamiltonian representation that is generalizable across\nmultiple system domains, thereby overcoming the limitations of system-specific\nmodels. Our results demonstrate that the meta-trained model not only adapts\neffectively to new systems but also captures a generalized Hamiltonian\nrepresentation that is consistent across different physical domains. 
Overall,\nthrough the use of meta learning, we offer a framework that achieves cross\ndomain generalization, providing a step towards a unified model for\nunderstanding a wide array of dynamical systems via deep learning.\n","authors":["Yeongwoo Song","Hawoong Jeong"],"pdf_url":"https://arxiv.org/pdf/2212.01168v3.pdf","comment":"Conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2311.06558v2","updated":"2024-01-22T10:07:39Z","published":"2023-11-11T12:28:31Z","title":"Convolve and Conquer: Data Comparison with Wiener Filters","summary":" Quantitative evaluations of differences and/or similarities between data\nsamples define and shape optimisation problems associated with learning data\ndistributions. Current methods to compare data often suffer from limitations in\ncapturing such distributions or lack desirable mathematical properties for\noptimisation (e.g. smoothness, differentiability, or convexity). In this paper,\nwe introduce a new method to measure (dis)similarities between paired samples\ninspired by Wiener-filter theory. The convolutional nature of Wiener filters\nallows us to comprehensively compare data samples in a globally correlated way.\nWe validate our approach in four machine learning applications: data\ncompression, medical imaging imputation, translated classification, and\nnon-parametric generative modelling. Our results demonstrate increased\nresolution in reconstructed images with better perceptual quality and higher\ndata fidelity, as well as robustness against translations, compared to\nconventional mean-squared-error analogue implementations.\n","authors":["Deborah Pelacani Cruz","George Strong","Oscar Bates","Carlos Cueto","Jiashun Yao","Lluis Guasch"],"pdf_url":"https://arxiv.org/pdf/2311.06558v2.pdf","comment":"10 pages, 5 figures, Medical Imaging Meets Neurips Workshop"},{"id":"http://arxiv.org/abs/2401.11798v1","updated":"2024-01-22T09:54:49Z","published":"2024-01-22T09:54:49Z","title":"Knowledge Distillation on Spatial-Temporal Graph Convolutional Network\n for Traffic Prediction","summary":" Efficient real-time traffic prediction is crucial for reducing transportation\ntime. To predict traffic conditions, we employ a spatio-temporal graph neural\nnetwork (ST-GNN) to model our real-time traffic data as temporal graphs.\nDespite its capabilities, it often encounters challenges in delivering\nefficient real-time predictions for real-world traffic data. Recognizing the\nsignificance of timely prediction due to the dynamic nature of real-time data,\nwe employ knowledge distillation (KD) as a solution to enhance the execution\ntime of ST-GNNs for traffic prediction. In this paper, We introduce a cost\nfunction designed to train a network with fewer parameters (the student) using\ndistilled data from a complex network (the teacher) while maintaining its\naccuracy close to that of the teacher. We use knowledge distillation,\nincorporating spatial-temporal correlations from the teacher network to enable\nthe student to learn the complex patterns perceived by the teacher. However, a\nchallenge arises in determining the student network architecture rather than\nconsidering it inadvertently. To address this challenge, we propose an\nalgorithm that utilizes the cost function to calculate pruning scores,\naddressing small network architecture search issues, and jointly fine-tunes the\nnetwork resulting from each pruning stage using KD. Ultimately, we evaluate our\nproposed ideas on two real-world datasets, PeMSD7 and PeMSD8. 
The results\nindicate that our method can maintain the student's accuracy close to that of\nthe teacher, even with the retention of only $3\\%$ of network parameters.\n","authors":["Mohammad Izadi","Mehran Safayani","Abdolreza Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2401.11798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12817v2","updated":"2024-01-22T09:44:18Z","published":"2023-10-19T15:12:44Z","title":"2D-3D Interlaced Transformer for Point Cloud Segmentation with\n Scene-Level Supervision","summary":" We present a Multimodal Interlaced Transformer (MIT) that jointly considers\n2D and 3D data for weakly supervised point cloud segmentation. Research studies\nhave shown that 2D and 3D features are complementary for point cloud\nsegmentation. However, existing methods require extra 2D annotations to achieve\n2D-3D information fusion. Considering the high annotation cost of point clouds,\neffective 2D and 3D feature fusion based on weakly supervised learning is in\ngreat demand. To this end, we propose a transformer model with two encoders and\none decoder for weakly supervised point cloud segmentation using only\nscene-level class tags. Specifically, the two encoders compute the\nself-attended features for 3D point clouds and 2D multi-view images,\nrespectively. The decoder implements interlaced 2D-3D cross-attention and\ncarries out implicit 2D and 3D feature fusion. We alternately switch the roles\nof queries and key-value pairs in the decoder layers. It turns out that the 2D\nand 3D features are iteratively enriched by each other. Experiments show that\nit performs favorably against existing weakly supervised point cloud\nsegmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The\nproject page will be available at https://jimmy15923.github.io/mit_web/.\n","authors":["Cheng-Kun Yang","Min-Hung Chen","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2310.12817v2.pdf","comment":"ICCV 2023 (main + supp). Website:\n https://jimmy15923.github.io/mit_web/"},{"id":"http://arxiv.org/abs/2401.11792v1","updated":"2024-01-22T09:44:16Z","published":"2024-01-22T09:44:16Z","title":"Safe and Generalized end-to-end Autonomous Driving System with\n Reinforcement Learning and Demonstrations","summary":" An intelligent driving system should be capable of dynamically formulating\nappropriate driving strategies based on the current environment and vehicle\nstatus, while ensuring the security and reliability of the system. However,\nexisting methods based on reinforcement learning and imitation learning suffer\nfrom low safety, poor generalization, and inefficient sampling. Additionally,\nthey cannot accurately predict future driving trajectories, and the accurate\nprediction of future driving trajectories is a precondition for making optimal\ndecisions. To solve these problems, in this paper, we introduce a Safe and\nGeneralized end-to-end Autonomous Driving System (SGADS) for complex and\nvarious scenarios. Our SGADS incorporates variational inference with\nnormalizing flows, enabling the intelligent vehicle to accurately predict\nfuture driving trajectories. Moreover, we propose the formulation of robust\nsafety constraints. Furthermore, we combine reinforcement learning with\ndemonstrations to augment search process of the agent. 
The experimental results\ndemonstrate that our SGADS can significantly improve safety performance,\nexhibit strong generalization, and enhance the training efficiency of\nintelligent vehicles in complex urban scenarios compared to existing methods.\n","authors":["Zuojin Tang","Xiaoyu Chen","YongQiang Li","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06101v2","updated":"2024-01-22T09:27:30Z","published":"2023-11-10T15:09:04Z","title":"In-Context Learning for MIMO Equalization Using Transformer-Based\n Sequence Models","summary":" Large pre-trained sequence models, such as transformer-based architectures,\nhave been recently shown to have the capacity to carry out in-context learning\n(ICL). In ICL, a decision on a new input is made via a direct mapping of the\ninput and of a few examples from the given task, serving as the task's context,\nto the output variable. No explicit updates of the model parameters are needed\nto tailor the decision to a new task. Pre-training, which amounts to a form of\nmeta-learning, is based on the observation of examples from several related\ntasks. Prior work has shown ICL capabilities for linear regression. In this\nstudy, we leverage ICL to address the inverse problem of multiple-input and\nmultiple-output (MIMO) equalization based on a context given by pilot symbols.\nA task is defined by the unknown fading channel and by the signal-to-noise\nratio (SNR) level, which may be known. To highlight the practical potential of\nthe approach, we allow the presence of quantization of the received signals. 
We\ndemonstrate via numerical results that transformer-based ICL has a threshold\nbehavior, whereby, as the number of pre-training tasks grows, the performance\nswitches from that of a minimum mean squared error (MMSE) equalizer with a\nprior determined by the pre-trained tasks to that of an MMSE equalizer with the\ntrue data-generating prior.\n","authors":["Matteo Zecchin","Kai Yu","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2311.06101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11772v1","updated":"2024-01-22T09:09:10Z","published":"2024-01-22T09:09:10Z","title":"LightDiC: A Simple yet Effective Approach for Large-scale Digraph\n Representation Learning","summary":" Most existing graph neural networks (GNNs) are limited to undirected graphs,\nwhose restricted scope of the captured relational information hinders their\nexpressive capabilities and deployments in real-world scenarios. Compared with\nundirected graphs, directed graphs (digraphs) fit the demand for modeling more\ncomplex topological systems by capturing more intricate relationships between\nnodes, such as formulating transportation and financial networks. While some\ndirected GNNs have been introduced, their inspiration mainly comes from deep\nlearning architectures, which lead to redundant complexity and computation,\nmaking them inapplicable to large-scale databases. To address these issues, we\npropose LightDiC, a scalable variant of the digraph convolution based on the\nmagnetic Laplacian. Since topology-related computations are conducted solely\nduring offline pre-processing, LightDiC achieves exceptional scalability,\nenabling downstream predictions to be trained separately without incurring\nrecursive computational costs. Theoretical analysis shows that LightDiC\nutilizes directed information to achieve message passing based on the complex\nfield, which corresponds to the proximal gradient descent process of the\nDirichlet energy optimization function from the perspective of digraph signal\ndenoising, ensuring its expressiveness. Experimental results demonstrate that\nLightDiC performs comparably well or even outperforms other SOTA methods in\nvarious downstream tasks, with fewer learnable parameters and higher training\nefficiency. Notably, LightDiC is the first DiGNN to provide satisfactory\nresults in the most representative large-scale database (ogbn-papers100M).\n","authors":["Xunkai Li","Meihao Liao","Zhengyu Wu","Daohan Su","Wentao Zhang","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11772v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11768v1","updated":"2024-01-22T09:03:16Z","published":"2024-01-22T09:03:16Z","title":"ADA-GNN: Atom-Distance-Angle Graph Neural Network for Crystal Material\n Property Prediction","summary":" Property prediction is a fundamental task in crystal material research. To\nmodel atoms and structures, structures represented as graphs are widely used\nand graph learning-based methods have achieved significant progress. Bond\nangles and bond distances are two key structural information that greatly\ninfluence crystal properties. However, most of the existing works only consider\nbond distances and overlook bond angles. The main challenge lies in the time\ncost of handling bond angles, which leads to a significant increase in\ninference time. 
To solve this issue, we first propose a crystal structure\nmodeling based on dual scale neighbor partitioning mechanism, which uses a\nlarger scale cutoff for edge neighbors and a smaller scale cutoff for angle\nneighbors. Then, we propose a novel Atom-Distance-Angle Graph Neural Network\n(ADA-GNN) for property prediction tasks, which can process node information and\nstructural information separately. The accuracy of predictions and inference\ntime are improved with the dual scale modeling and the specially designed\narchitecture of ADA-GNN. The experimental results validate that our approach\nachieves state-of-the-art results in two large-scale material benchmark\ndatasets on property prediction tasks.\n","authors":["Jiao Huang","Qianli Xing","Jinglong Ji","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05023v2","updated":"2024-01-22T08:47:49Z","published":"2023-06-08T08:22:27Z","title":"Beyond Vanilla Variational Autoencoders: Detecting Posterior Collapse in\n Conditional and Hierarchical Variational Autoencoders","summary":" The posterior collapse phenomenon in variational autoencoder (VAE), where the\nvariational posterior distribution closely matches the prior distribution, can\nhinder the quality of the learned latent variables. As a consequence of\nposterior collapse, the latent variables extracted by the encoder in VAE\npreserve less information from the input data and thus fail to produce\nmeaningful representations as input to the reconstruction process in the\ndecoder. While this phenomenon has been an actively addressed topic related to\nVAE performance, the theory for posterior collapse remains underdeveloped,\nespecially beyond the standard VAE. In this work, we advance the theoretical\nunderstanding of posterior collapse to two important and prevalent yet less\nstudied classes of VAE: conditional VAE and hierarchical VAE. Specifically, via\na non-trivial theoretical analysis of linear conditional VAE and hierarchical\nVAE with two levels of latent, we prove that the cause of posterior collapses\nin these models includes the correlation between the input and output of the\nconditional VAE and the effect of learnable encoder variance in the\nhierarchical VAE. We empirically validate our theoretical findings for linear\nconditional and hierarchical VAE and demonstrate that these results are also\npredictive for non-linear cases with extensive experiments.\n","authors":["Hien Dang","Tho Tran","Tan Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2306.05023v2.pdf","comment":"International Conference on Learning Representations (ICLR) 2024"},{"id":"http://arxiv.org/abs/2401.11760v1","updated":"2024-01-22T08:45:29Z","published":"2024-01-22T08:45:29Z","title":"Towards Effective and General Graph Unlearning via Mutual Evolution","summary":" With the rapid advancement of AI applications, the growing needs for data\nprivacy and model robustness have highlighted the importance of machine\nunlearning, especially in thriving graph-based scenarios. However, most\nexisting graph unlearning strategies primarily rely on well-designed\narchitectures or manual process, rendering them less user-friendly and posing\nchallenges in terms of deployment efficiency. Furthermore, striking a balance\nbetween unlearning performance and framework generalization is also a pivotal\nconcern. 
To address the above issues, we propose \\underline{\\textbf{M}}utual\n\\underline{\\textbf{E}}volution \\underline{\\textbf{G}}raph\n\\underline{\\textbf{U}}nlearning (MEGU), a new mutual evolution paradigm that\nsimultaneously evolves the predictive and unlearning capacities of graph\nunlearning. By incorporating aforementioned two components, MEGU ensures\ncomplementary optimization in a unified training framework that aligns with the\nprediction and unlearning requirements. Extensive experiments on 9 graph\nbenchmark datasets demonstrate the superior performance of MEGU in addressing\nunlearning requirements at the feature, node, and edge levels. Specifically,\nMEGU achieves average performance improvements of 2.7\\%, 2.5\\%, and 3.2\\%\nacross these three levels of unlearning tasks when compared to state-of-the-art\nbaselines. Furthermore, MEGU exhibits satisfactory training efficiency,\nreducing time and space overhead by an average of 159.8x and 9.6x,\nrespectively, in comparison to retraining GNN from scratch.\n","authors":["Xunkai Li","Yulin Zhao","Zhengyu Wu","Wentao Zhang","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11760v1.pdf","comment":"Accepted by AAAI 2024 Oral"},{"id":"http://arxiv.org/abs/2401.09953v2","updated":"2024-01-22T08:32:02Z","published":"2024-01-18T12:58:53Z","title":"Through the Dual-Prism: A Spectral Perspective on Graph Data\n Augmentation for Graph Classification","summary":" Graph Neural Networks (GNNs) have become the preferred tool to process graph\ndata, with their efficacy being boosted through graph data augmentation\ntechniques. Despite the evolution of augmentation methods, issues like graph\nproperty distortions and restricted structural changes persist. This leads to\nthe question: Is it possible to develop more property-conserving and\nstructure-sensitive augmentation methods? Through a spectral lens, we\ninvestigate the interplay between graph properties, their augmentation, and\ntheir spectral behavior, and found that keeping the low-frequency eigenvalues\nunchanged can preserve the critical properties at a large scale when generating\naugmented graphs. These observations inform our introduction of the Dual-Prism\n(DP) augmentation method, comprising DP-Noise and DP-Mask, which adeptly\nretains essential graph properties while diversifying augmented graphs.\nExtensive experiments validate the efficiency of our approach, providing a new\nand promising direction for graph data augmentation.\n","authors":["Yutong Xia","Runpeng Yu","Yuxuan Liang","Xavier Bresson","Xinchao Wang","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2401.09953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11755v1","updated":"2024-01-22T08:31:53Z","published":"2024-01-22T08:31:53Z","title":"FedGTA: Topology-aware Averaging for Federated Graph Learning","summary":" Federated Graph Learning (FGL) is a distributed machine learning paradigm\nthat enables collaborative training on large-scale subgraphs across multiple\nlocal systems. Existing FGL studies fall into two categories: (i) FGL\nOptimization, which improves multi-client training in existing machine learning\nmodels; (ii) FGL Model, which enhances performance with complex local models\nand multi-client interactions. However, most FGL optimization strategies are\ndesigned specifically for the computer vision domain and ignore graph\nstructure, presenting dissatisfied performance and slow convergence. 
Meanwhile,\ncomplex local model architectures in FGL Models studies lack scalability for\nhandling large-scale subgraphs and have deployment limitations. To address\nthese issues, we propose Federated Graph Topology-aware Aggregation (FedGTA), a\npersonalized optimization strategy that optimizes through topology-aware local\nsmoothing confidence and mixed neighbor features. During experiments, we deploy\nFedGTA in 12 multi-scale real-world datasets with the Louvain and Metis split.\nThis allows us to evaluate the performance and robustness of FedGTA across a\nrange of scenarios. Extensive experiments demonstrate that FedGTA achieves\nstate-of-the-art performance while exhibiting high scalability and efficiency.\nThe experiment includes ogbn-papers100M, the most representative large-scale\ngraph database so that we can verify the applicability of our method to\nlarge-scale graph learning. To the best of our knowledge, our study is the\nfirst to bridge large-scale graph learning with FGL using this optimization\nstrategy, contributing to the development of efficient and scalable FGL\nmethods.\n","authors":["Xunkai Li","Zhengyu Wu","Wentao Zhang","Yinlin Zhu","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11755v1.pdf","comment":"Accepted by VLDB 2024"},{"id":"http://arxiv.org/abs/2401.11750v1","updated":"2024-01-22T08:23:31Z","published":"2024-01-22T08:23:31Z","title":"AdaFGL: A New Paradigm for Federated Node Classification with Topology\n Heterogeneity","summary":" Recently, Federated Graph Learning (FGL) has attracted significant attention\nas a distributed framework based on graph neural networks, primarily due to its\ncapability to break data silos. Existing FGL studies employ community split on\nthe homophilous global graph by default to simulate federated semi-supervised\nnode classification settings. Such a strategy assumes the consistency of\ntopology between the multi-client subgraphs and the global graph, where\nconnected nodes are highly likely to possess similar feature distributions and\nthe same label. However, in real-world implementations, the varying\nperspectives of local data engineering result in various subgraph topologies,\nposing unique heterogeneity challenges in FGL. Unlike the well-known label\nNon-independent identical distribution (Non-iid) problems in federated\nlearning, FGL heterogeneity essentially reveals the topological divergence\namong multiple clients, namely homophily or heterophily. To simulate and handle\nthis unique challenge, we introduce the concept of structure Non-iid split and\nthen present a new paradigm called \\underline{Ada}ptive \\underline{F}ederated\n\\underline{G}raph \\underline{L}earning (AdaFGL), a decoupled two-step\npersonalized approach. To begin with, AdaFGL employs standard multi-client\nfederated collaborative training to acquire the federated knowledge extractor\nby aggregating uploaded models in the final round at the server. Then, each\nclient conducts personalized training based on the local subgraph and the\nfederated knowledge extractor. Extensive experiments on the 12 graph benchmark\ndatasets validate the superior performance of AdaFGL over state-of-the-art\nbaselines. 
Specifically, in terms of test accuracy, our proposed AdaFGL\noutperforms baselines by significant margins of 3.24\\% and 5.57\\% on community\nsplit and structure Non-iid split, respectively.\n","authors":["Xunkai Li","Zhengyu Wu","Wentao Zhang","Henan Sun","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11750v1.pdf","comment":"Accepted by ICDE 2024"},{"id":"http://arxiv.org/abs/2401.11748v1","updated":"2024-01-22T08:20:47Z","published":"2024-01-22T08:20:47Z","title":"GI-PIP: Do We Require Impractical Auxiliary Dataset for Gradient\n Inversion Attacks?","summary":" Deep gradient inversion attacks expose a serious threat to Federated Learning\n(FL) by accurately recovering private data from shared gradients. However, the\nstate-of-the-art heavily relies on impractical assumptions to access excessive\nauxiliary data, which violates the basic data partitioning principle of FL. In\nthis paper, a novel method, Gradient Inversion Attack using Practical Image\nPrior (GI-PIP), is proposed under a revised threat model. GI-PIP exploits\nanomaly detection models to capture the underlying distribution from fewer\ndata, while GAN-based methods consume significant more data to synthesize\nimages. The extracted distribution is then leveraged to regulate the attack\nprocess as Anomaly Score loss. Experimental results show that GI-PIP achieves a\n16.12 dB PSNR recovery using only 3.8\\% data of ImageNet, while GAN-based\nmethods necessitate over 70\\%. Moreover, GI-PIP exhibits superior capability on\ndistribution generalization compared to GAN-based methods. Our approach\nsignificantly alleviates the auxiliary data requirement on both amount and\ndistribution in gradient inversion attacks, hence posing more substantial\nthreat to real-world FL.\n","authors":["Yu sun","Gaojian Xiong","Xianxun Yao","Kailang Ma","Jian Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11748v1.pdf","comment":"5pages, 5 figures, accepted to ICASSP 2024, not published yet"},{"id":"http://arxiv.org/abs/2401.10765v2","updated":"2024-01-22T08:17:42Z","published":"2024-01-19T15:37:11Z","title":"Starlit: Privacy-Preserving Federated Learning to Enhance Financial\n Fraud Detection","summary":" Federated Learning (FL) is a data-minimization approach enabling\ncollaborative model training across diverse clients with local data, avoiding\ndirect data exchange. However, state-of-the-art FL solutions to identify\nfraudulent financial transactions exhibit a subset of the following\nlimitations. They (1) lack a formal security definition and proof, (2) assume\nprior freezing of suspicious customers' accounts by financial institutions\n(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$\ncomputationally expensive modular exponentiation (where $n$ is the total number\nof financial institutions) or highly inefficient fully homomorphic encryption,\n(4) assume the parties have already completed the identity alignment phase,\nhence excluding it from the implementation, performance evaluation, and\nsecurity analysis, and (5) struggle to resist clients' dropouts. This work\nintroduces Starlit, a novel scalable privacy-preserving FL mechanism that\novercomes these limitations. It has various applications, such as enhancing\nfinancial fraud detection, mitigating terrorism, and enhancing digital health.\nWe implemented Starlit and conducted a thorough performance analysis using\nsynthetic data from a key player in global financial transactions. 
The\nevaluation indicates Starlit's scalability, efficiency, and accuracy.\n","authors":["Aydin Abadi","Bradley Doyle","Francesco Gini","Kieron Guinamard","Sasi Kumar Murakonda","Jack Liddell","Paul Mellor","Steven J. Murdoch","Mohammad Naseri","Hector Page","George Theodorakopoulos","Suzanne Weller"],"pdf_url":"https://arxiv.org/pdf/2401.10765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19604v3","updated":"2024-01-22T08:13:50Z","published":"2023-05-31T07:22:15Z","title":"Medication Recommendation via Domain Knowledge Informed Deep Learning","summary":" Medication recommendation is a fundamental yet crucial branch of healthcare,\nwhich provides opportunities to support clinical physicians with more accurate\nmedication prescriptions for patients with complex health conditions. Learning\nfrom electronic health records (EHR) to recommend medications is the most\ncommon way in previous studies. However, most of them neglect incorporating\ndomain knowledge according to the clinical manifestations in the EHR of the\npatient. To address these issues, we propose a novel \\textbf{D}omain\n\\textbf{K}nowledge \\textbf{I}nformed \\textbf{Net}work (DKINet) to integrate\ndomain knowledge with observable clinical manifestations of the patient, which\nis the first dynamic domain knowledge informed framework toward medication\nrecommendation. In particular, we first design a knowledge-driven encoder to\ncapture the domain information and then develop a data-driven encoder to\nintegrate domain knowledge into the observable EHR. To endow the model with the\ncapability of temporal decision, we design an explicit medication encoder for\nlearning the longitudinal dependence of the patient. Extensive experiments on\nthree publicly available datasets verify the superiority of our method. The\ncode will be public upon acceptance.\n","authors":["Sicen Liu","Xiaolong Wang","Xianbing Zhao","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19604v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11740v1","updated":"2024-01-22T07:37:25Z","published":"2024-01-22T07:37:25Z","title":"Multi-level Cross-modal Alignment for Image Clustering","summary":" Recently, the cross-modal pretraining model has been employed to produce\nmeaningful pseudo-labels to supervise the training of an image clustering\nmodel. However, numerous erroneous alignments in a cross-modal pre-training\nmodel could produce poor-quality pseudo-labels and degrade clustering\nperformance. To solve the aforementioned issue, we propose a novel\n\\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in\na cross-modal pretraining model for downstream tasks, by building a smaller but\nbetter semantic space and aligning the images and texts in three levels, i.e.,\ninstance-level, prototype-level, and semantic-level. Theoretical results show\nthat our proposed method converges, and suggests effective means to reduce the\nexpected clustering risk of our method. 
Experimental results on five benchmark\ndatasets clearly show the superiority of our new method.\n","authors":["Liping Qiu","Qin Zhang","Xiaojun Chen","Shaotian Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11739v1","updated":"2024-01-22T07:34:06Z","published":"2024-01-22T07:34:06Z","title":"EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models","summary":" Diffusion models have recently received increasing research attention for\ntheir remarkable transfer abilities in semantic segmentation tasks. However,\ngenerating fine-grained segmentation masks with diffusion models often requires\nadditional training on annotated datasets, leaving it unclear to what extent\npre-trained diffusion models alone understand the semantic relations of their\ngenerated images. To address this question, we leverage the semantic knowledge\nextracted from Stable Diffusion (SD) and aim to develop an image segmentor\ncapable of generating fine-grained segmentation maps without any additional\ntraining. The primary difficulty stems from the fact that semantically\nmeaningful feature maps typically exist only in the spatially lower-dimensional\nlayers, which poses a challenge in directly extracting pixel-level semantic\nrelations from these feature maps. To overcome this issue, our framework\nidentifies semantic correspondences between image pixels and spatial locations\nof low-dimensional feature maps by exploiting SD's generation process and\nutilizes them for constructing image-resolution segmentation maps. In extensive\nexperiments, the produced segmentation maps are demonstrated to be well\ndelineated and capture detailed parts of the images, indicating the existence\nof highly accurate pixel-level semantic knowledge in diffusion models.\n","authors":["Koichi Namekata","Amirmojtaba Sabour","Sanja Fidler","Seung Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11739v1.pdf","comment":"ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/"},{"id":"http://arxiv.org/abs/2401.11736v1","updated":"2024-01-22T07:24:15Z","published":"2024-01-22T07:24:15Z","title":"Attention on Personalized Clinical Decision Support System: Federated\n Learning Approach","summary":" Health management has become a primary problem as new kinds of diseases and\ncomplex symptoms are introduced to a rapidly growing modern society. Building a\nbetter and smarter healthcare infrastructure is one of the ultimate goals of a\nsmart city. To the best of our knowledge, neural network models are already\nemployed to assist healthcare professionals in achieving this goal. Typically,\ntraining a neural network requires a rich amount of data but heterogeneous and\nvulnerable properties of clinical data introduce a challenge for the\ntraditional centralized network. Moreover, adding new inputs to a medical\ndatabase requires re-training an existing model from scratch. To tackle these\nchallenges, we proposed a deep learning-based clinical decision support system\ntrained and managed under a federated learning paradigm. We focused on a novel\nstrategy to guarantee the safety of patient privacy and overcome the risk of\ncyberattacks while enabling large-scale clinical data mining. As a result, we\ncan leverage rich clinical data for training each local neural network without\nthe need for exchanging the confidential data of patients. Moreover, we\nimplemented the proposed scheme as a sequence-to-sequence model architecture\nintegrating the attention mechanism. 
Thus, our objective is to provide a\npersonalized clinical decision support system with evolvable characteristics\nthat can deliver accurate solutions and assist healthcare professionals in\nmedical diagnosing.\n","authors":["Chu Myaet Thwal","Kyi Thar","Ye Lin Tun","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11736v1.pdf","comment":"Published in IEEE BigComp 2021"},{"id":"http://arxiv.org/abs/2401.11731v1","updated":"2024-01-22T07:19:16Z","published":"2024-01-22T07:19:16Z","title":"Fast and Scalable Network Slicing by Integrating Deep Learning with\n Lagrangian Methods","summary":" Network slicing is a key technique in 5G and beyond for efficiently\nsupporting diverse services. Many network slicing solutions rely on deep\nlearning to manage complex and high-dimensional resource allocation problems.\nHowever, deep learning models suffer limited generalization and adaptability to\ndynamic slicing configurations. In this paper, we propose a novel framework\nthat integrates constrained optimization methods and deep learning models,\nresulting in strong generalization and superior approximation capability. Based\non the proposed framework, we design a new neural-assisted algorithm to\nallocate radio resources to slices to maximize the network utility under\ninter-slice resource constraints. The algorithm exhibits high scalability,\naccommodating varying numbers of slices and slice configurations with ease. We\nimplement the proposed solution in a system-level network simulator and\nevaluate its performance extensively by comparing it to state-of-the-art\nsolutions including deep reinforcement learning approaches. The numerical\nresults show that our solution obtains near-optimal quality-of-service\nsatisfaction and promising generalization performance under different network\nslicing scenarios.\n","authors":["Tianlun Hu","Qi Liao","Qiang Liu","Antonio Massaro","Georg Carle"],"pdf_url":"https://arxiv.org/pdf/2401.11731v1.pdf","comment":"6 pages, 5 figures, IEEE Global Communications Conference 2023"},{"id":"http://arxiv.org/abs/2305.00418v3","updated":"2024-01-22T07:09:17Z","published":"2023-04-30T07:28:06Z","title":"An Empirical Study of Using Large Language Models for Unit Test\n Generation","summary":" A code generation model generates code by taking a prompt from a code\ncomment, existing code, or a combination of both. Although code generation\nmodels (e.g., GitHub Copilot) are increasingly being adopted in practice, it is\nunclear whether they can successfully be used for unit test generation without\nfine-tuning for a strongly typed language like Java. To fill this gap, we\ninvestigated how well three models (Codex, GPT-3.5-Turbo, and StarCoder) can\ngenerate unit tests. We used two benchmarks (HumanEval and Evosuite SF110) to\ninvestigate the effect of context generation on the unit test generation\nprocess. We evaluated the models based on compilation rates, test correctness,\ntest coverage, and test smells. We found that the Codex model achieved above\n80% coverage for the HumanEval dataset, but no model had more than 2% coverage\nfor the EvoSuite SF110 benchmark. The generated tests also suffered from test\nsmells, such as Duplicated Asserts and Empty Tests.\n","authors":["Mohammed Latif Siddiq","Joanna C. S. 
Santos","Ridwanul Hasan Tanvir","Noshin Ulfat","Fahmid Al Rifat","Vinicius Carvalho Lopes"],"pdf_url":"https://arxiv.org/pdf/2305.00418v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11726v1","updated":"2024-01-22T07:07:32Z","published":"2024-01-22T07:07:32Z","title":"Detecting Out-of-Distribution Samples via Conditional Distribution\n Entropy with Optimal Transport","summary":" When deploying a trained machine learning model in the real world, it is\ninevitable to receive inputs from out-of-distribution (OOD) sources. For\ninstance, in continual learning settings, it is common to encounter OOD samples\ndue to the non-stationarity of a domain. More generally, when we have access to\na set of test inputs, the existing rich line of OOD detection solutions,\nespecially the recent promise of distance-based methods, falls short in\neffectively utilizing the distribution information from training samples and\ntest inputs. In this paper, we argue that empirical probability distributions\nthat incorporate geometric information from both training samples and test\ninputs can be highly beneficial for OOD detection in the presence of test\ninputs available. To address this, we propose to model OOD detection as a\ndiscrete optimal transport problem. Within the framework of optimal transport,\nwe propose a novel score function known as the \\emph{conditional distribution\nentropy} to quantify the uncertainty of a test input being an OOD sample. Our\nproposal inherits the merits of certain distance-based methods while\neliminating the reliance on distribution assumptions, a-prior knowledge, and\nspecific training mechanisms. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method outperforms its competitors in OOD\ndetection.\n","authors":["Chuanwen Feng","Wenlong Chen","Ao Ke","Yilong Ren","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11720v1","updated":"2024-01-22T06:47:00Z","published":"2024-01-22T06:47:00Z","title":"Graph Condensation: A Survey","summary":" The burgeoning volume of graph data poses significant challenges in storage,\ntransmission, and particularly the training of graph neural networks (GNNs). To\naddress these challenges, graph condensation (GC) has emerged as an innovative\nsolution. GC focuses on synthesizing a compact yet highly representative graph,\non which GNNs can achieve performance comparable to trained on the large\noriginal graph. The notable efficacy of GC and its broad prospects have\ngarnered significant attention and spurred extensive research. This survey\npaper provides an up-to-date and systematic overview of GC, organizing existing\nresearch into four categories aligned with critical GC evaluation criteria:\neffectiveness, generalization, fairness, and efficiency. To facilitate an\nin-depth and comprehensive understanding of GC, we examine various methods\nunder each category and thoroughly discuss two essential components within GC:\noptimization strategies and condensed graph generation. 
Additionally, we\nintroduce the applications of GC in a variety of fields, and highlight the\npresent challenges and novel insights in GC, promoting advancements in future\nresearch.\n","authors":["Xinyi Gao","Junliang Yu","Wei Jiang","Tong Chen","Wentao Zhang","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2401.11720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11698v1","updated":"2024-01-22T05:44:43Z","published":"2024-01-22T05:44:43Z","title":"Admission Prediction in Undergraduate Applications: an Interpretable\n Deep Learning Approach","summary":" This article addresses the challenge of validating the admission committee's\ndecisions for undergraduate admissions. In recent years, the traditional review\nprocess has struggled to handle the overwhelmingly large amount of applicants'\ndata. Moreover, this traditional assessment often leads to human bias, which\nmight result in discrimination among applicants. Although classical machine\nlearning-based approaches exist that aim to verify the quantitative assessment\nmade by the application reviewers, these methods lack scalability and suffer\nfrom performance issues when a large volume of data is in place. In this\ncontext, we propose deep learning-based classifiers, namely Feed-Forward and\nInput Convex neural networks, which overcome the challenges faced by the\nexisting methods. Furthermore, we give additional insights into our model by\nincorporating an interpretability module, namely LIME. Our training and test\ndatasets comprise applicants' data with a wide range of variables and\ninformation. Our models achieve higher accuracy compared to the best-performing\ntraditional machine learning-based approach by a considerable margin of 3.03\\%.\nAdditionally, we show the sensitivity of different features and their relative\nimpacts on the overall admission decision using the LIME technique.\n","authors":["Amisha Priyadarshini","Barbara Martinez-Neda","Sergio Gago-Masague"],"pdf_url":"https://arxiv.org/pdf/2401.11698v1.pdf","comment":"This paper has been accepted for Transdisciplinary AI 2023 conference"},{"id":"http://arxiv.org/abs/2401.11694v1","updated":"2024-01-22T05:26:18Z","published":"2024-01-22T05:26:18Z","title":"Parametric Matrix Models","summary":" We present a general class of machine learning algorithms called parametric\nmatrix models. Parametric matrix models are based on matrix equations, and the\ndesign is motivated by the efficiency of reduced basis methods for\napproximating solutions of parametric equations. The dependent variables can be\ndefined implicitly or explicitly, and the equations may use algebraic,\ndifferential, or integral relations. Parametric matrix models can be trained\nwith empirical data only, and no high-fidelity model calculations are needed.\nWhile originally designed for scientific computing, parametric matrix models\nare universal function approximators that can be applied to general machine\nlearning problems. After introducing the underlying theory, we apply parametric\nmatrix models to a series of different challenges that show their performance\nfor a wide range of problems. For all the challenges tested here, parametric\nmatrix models produce accurate results within a computational framework that\nallows for parameter extrapolation and interpretability.\n","authors":["Patrick Cook","Danny Jammooa","Morten Hjorth-Jensen","Daniel D. 
Lee","Dean Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10371v2","updated":"2024-01-22T05:24:17Z","published":"2024-01-18T20:35:47Z","title":"Langevin Unlearning: A New Perspective of Noisy Gradient Descent for\n Machine Unlearning","summary":" Machine unlearning has raised significant interest with the adoption of laws\nensuring the ``right to be forgotten''. Researchers have provided a\nprobabilistic notion of approximate unlearning under a similar definition of\nDifferential Privacy (DP), where privacy is defined as statistical\nindistinguishability to retraining from scratch. We propose Langevin\nunlearning, an unlearning framework based on noisy gradient descent with\nprivacy guarantees for approximate unlearning problems. Langevin unlearning\nunifies the DP learning process and the privacy-certified unlearning process\nwith many algorithmic benefits. These include approximate certified unlearning\nfor non-convex problems, complexity saving compared to retraining, sequential\nand batch unlearning for multiple unlearning requests. We verify the\npracticality of Langevin unlearning by studying its privacy-utility-complexity\ntrade-off via experiments on benchmark datasets, and also demonstrate its\nsuperiority against gradient-decent-plus-output-perturbation based approximate\nunlearning.\n","authors":["Eli Chien","Haoyu Wang","Ziang Chen","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11687v1","updated":"2024-01-22T04:54:42Z","published":"2024-01-22T04:54:42Z","title":"TIM: An Efficient Temporal Interaction Module for Spiking Transformer","summary":" Spiking Neural Networks (SNNs), as the third generation of neural networks,\nhave gained prominence for their biological plausibility and computational\nefficiency, especially in processing diverse datasets. The integration of\nattention mechanisms, inspired by advancements in neural network architectures,\nhas led to the development of Spiking Transformers. These have shown promise in\nenhancing SNNs' capabilities, particularly in the realms of both static and\nneuromorphic datasets. Despite their progress, a discernible gap exists in\nthese systems, specifically in the Spiking Self Attention (SSA) mechanism's\neffectiveness in leveraging the temporal processing potential of SNNs. To\naddress this, we introduce the Temporal Interaction Module (TIM), a novel,\nconvolution-based enhancement designed to augment the temporal data processing\nabilities within SNN architectures. TIM's integration into existing SNN\nframeworks is seamless and efficient, requiring minimal additional parameters\nwhile significantly boosting their temporal information handling capabilities.\nThrough rigorous experimentation, TIM has demonstrated its effectiveness in\nexploiting temporal information, leading to state-of-the-art performance across\nvarious neuromorphic datasets.\n","authors":["Sicheng Shen","Dongcheng Zhao","Guobin Shen","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.11687v1.pdf","comment":"10pages,6figures"},{"id":"http://arxiv.org/abs/2310.03298v3","updated":"2024-01-22T04:39:36Z","published":"2023-10-05T03:56:09Z","title":"A Latent Variable Approach for Non-Hierarchical Multi-Fidelity Adaptive\n Sampling","summary":" Multi-fidelity (MF) methods are gaining popularity for enhancing surrogate\nmodeling and design optimization by incorporating data from various\nlow-fidelity (LF) models. 
While most existing MF methods assume a fixed\ndataset, adaptive sampling methods that dynamically allocate resources among\nfidelity models can achieve higher efficiency in the exploring and exploiting\nthe design space. However, most existing MF methods rely on the hierarchical\nassumption of fidelity levels or fail to capture the intercorrelation between\nmultiple fidelity levels and utilize it to quantify the value of the future\nsamples and navigate the adaptive sampling. To address this hurdle, we propose\na framework hinged on a latent embedding for different fidelity models and the\nassociated pre-posterior analysis to explicitly utilize their correlation for\nadaptive sampling. In this framework, each infill sampling iteration includes\ntwo steps: We first identify the location of interest with the greatest\npotential improvement using the high-fidelity (HF) model, then we search for\nthe next sample across all fidelity levels that maximize the improvement per\nunit cost at the location identified in the first step. This is made possible\nby a single Latent Variable Gaussian Process (LVGP) model that maps different\nfidelity models into an interpretable latent space to capture their\ncorrelations without assuming hierarchical fidelity levels. The LVGP enables us\nto assess how LF sampling candidates will affect HF response with pre-posterior\nanalysis and determine the next sample with the best benefit-to-cost ratio.\nThrough test cases, we demonstrate that the proposed method outperforms the\nbenchmark methods in both MF global fitting (GF) and Bayesian Optimization (BO)\nproblems in convergence rate and robustness. Moreover, the method offers the\nflexibility to switch between GF and BO by simply changing the acquisition\nfunction.\n","authors":["Yi-Ping Chen","Liwei Wang","Yigitcan Comlek","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2310.03298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13158v2","updated":"2024-01-22T03:47:17Z","published":"2023-07-24T22:52:02Z","title":"Multi-UAV Speed Control with Collision Avoidance and Handover-aware Cell\n Association: DRL with Action Branching","summary":" This paper presents a deep reinforcement learning solution for optimizing\nmulti-UAV cell-association decisions and their moving velocity on a 3D aerial\nhighway. The objective is to enhance transportation and communication\nperformance, including collision avoidance, connectivity, and handovers. The\nproblem is formulated as a Markov decision process (MDP) with UAVs' states\ndefined by velocities and communication data rates. We propose a neural\narchitecture with a shared decision module and multiple network branches, each\ndedicated to a specific action dimension in a 2D transportation-communication\nspace. This design efficiently handles the multi-dimensional action space,\nallowing independence for individual action dimensions. We introduce two\nmodels, Branching Dueling Q-Network (BDQ) and Branching Dueling Double Deep\nQ-Network (Dueling DDQN), to demonstrate the approach. 
Simulation results show\na significant improvement of 18.32% compared to existing benchmarks.\n","authors":["Zijiang Yan","Wael Jaafar","Bassant Selim","Hina Tabassum"],"pdf_url":"https://arxiv.org/pdf/2307.13158v2.pdf","comment":"IEEE Globecom 2023 Accepted"},{"id":"http://arxiv.org/abs/2401.11679v1","updated":"2024-01-22T03:44:35Z","published":"2024-01-22T03:44:35Z","title":"Simulating Nighttime Visible Satellite Imagery of Tropical Cyclones\n Using Conditional Generative Adversarial Networks","summary":" Visible (VIS) imagery of satellites has various important applications in\nmeteorology, including monitoring Tropical Cyclones (TCs). However, it is\nunavailable at night because of the lack of sunlight. This study presents a\nConditional Generative Adversarial Networks (CGAN) model that generates highly\naccurate nighttime visible reflectance using infrared (IR) bands and sunlight\ndirection parameters as input. The model was trained and validated using target\narea observations of the Advanced Himawari Imager (AHI) in the daytime. This\nstudy also presents the first nighttime model validation using the Day/Night\nBand (DNB) of the Visible/Infrared Imager Radiometer Suite (VIIRS). The daytime\nstatistical results of the Structural Similarity Index Measure (SSIM), Peak\nSignal-to-Noise Ratio (PSNR), Root Mean Square Error (RMSE), Correlation\nCoefficient (CC), and Bias are 0.885, 28.3, 0.0428, 0.984, and -0.0016\nrespectively, completely surpassing the model performance of previous studies.\nThe nighttime statistical results of SSIM, PSNR, RMSE, and CC are 0.821, 24.4,\n0.0643, and 0.969 respectively, which are slightly negatively impacted by the\nparallax between satellites. We performed full-disk model validation which\nproves our model could also be readily applied in the tropical ocean without\nTCs in the northern hemisphere. This model contributes to the nighttime\nmonitoring of meteorological phenomena by providing accurate AI-generated\nvisible imagery with adjustable virtual sunlight directions.\n","authors":["Jinghuai Yao","Puyuan Du","Yucheng Zhao","Yubo Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01841v3","updated":"2024-01-22T03:43:34Z","published":"2024-01-03T17:19:54Z","title":"Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov\n Decision Processes","summary":" A fundamental (and largely open) challenge in sequential decision-making is\ndealing with non-stationary environments, where exogenous environmental\nconditions change over time. Such problems are traditionally modeled as\nnon-stationary Markov decision processes (NSMDP). However, existing approaches\nfor decision-making in NSMDPs have two major shortcomings: first, they assume\nthat the updated environmental dynamics at the current time are known (although\nfuture dynamics can change); and second, planning is largely pessimistic, i.e.,\nthe agent acts ``safely'' to account for the non-stationary evolution of the\nenvironment. We argue that both these assumptions are invalid in practice --\nupdated environmental conditions are rarely known, and as the agent interacts\nwith the environment, it can learn about the updated dynamics and avoid being\npessimistic, at least in states whose dynamics it is confident about. We\npresent a heuristic search algorithm called \\textit{Adaptive Monte Carlo Tree\nSearch (ADA-MCTS)} that addresses these challenges. 
We show that the agent can\nlearn the updated dynamics of the environment over time and then act as it\nlearns, i.e., if the agent is in a region of the state space about which it has\nupdated knowledge, it can avoid being pessimistic. To quantify ``updated\nknowledge,'' we disintegrate the aleatoric and epistemic uncertainty in the\nagent's updated belief and show how the agent can use these estimates for\ndecision-making. We compare the proposed approach with the multiple\nstate-of-the-art approaches in decision-making across multiple well-established\nopen-source problems and empirically show that our approach is faster and\nhighly adaptive without sacrificing safety.\n","authors":["Baiting Luo","Yunuo Zhang","Abhishek Dubey","Ayan Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2401.01841v3.pdf","comment":"Accepted for publication at the International Conference on\n Autonomous Agents and MultiAgent Systems (AAMAS), 2024"},{"id":"http://arxiv.org/abs/2401.11671v1","updated":"2024-01-22T03:09:00Z","published":"2024-01-22T03:09:00Z","title":"RTA-Former: Reverse Transformer Attention for Polyp Segmentation","summary":" Polyp segmentation is a key aspect of colorectal cancer prevention, enabling\nearly detection and guiding subsequent treatments. Intelligent diagnostic\ntools, including deep learning solutions, are widely explored to streamline and\npotentially automate this process. However, even with many powerful network\narchitectures, there still comes the problem of producing accurate edge\nsegmentation. In this paper, we introduce a novel network, namely RTA-Former,\nthat employs a transformer model as the encoder backbone and innovatively\nadapts Reverse Attention (RA) with a transformer stage in the decoder for\nenhanced edge segmentation. The results of the experiments illustrate that\nRTA-Former achieves state-of-the-art (SOTA) performance in five polyp\nsegmentation datasets. The strong capability of RTA-Former holds promise in\nimproving the accuracy of Transformer-based polyp segmentation, potentially\nleading to better clinical decisions and patient outcomes. Our code will be\npublicly available on GitHub.\n","authors":["Zhikai Li","Murong Yi","Ali Uneri","Sihan Niu","Craig Jones"],"pdf_url":"https://arxiv.org/pdf/2401.11671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11669v1","updated":"2024-01-22T03:07:24Z","published":"2024-01-22T03:07:24Z","title":"An Improved Grey Wolf Optimization Algorithm for Heart Disease\n Prediction","summary":" This paper presents a unique solution to challenges in medical image\nprocessing by incorporating an adaptive curve grey wolf optimization (ACGWO)\nalgorithm into neural network backpropagation. Neural networks show potential\nin medical data but suffer from issues like overfitting and lack of\ninterpretability due to imbalanced and scarce data. Traditional Gray Wolf\nOptimization (GWO) also has its drawbacks, such as a lack of population\ndiversity and premature convergence. This paper addresses these problems by\nintroducing an adaptive algorithm, enhancing the standard GWO with a sigmoid\nfunction. This algorithm was extensively compared to four leading algorithms\nusing six well-known test functions, outperforming them effectively. Moreover,\nby utilizing the ACGWO, we increase the robustness and generalization of the\nneural network, resulting in more interpretable predictions. 
Applied to the\npublicly accessible Cleveland Heart Disease dataset, our technique surpasses\nten other methods, achieving 86.8% accuracy, indicating its potential for\nefficient heart disease prediction in the clinical setting.\n","authors":["Sihan Niu","Yifan Zhou","Zhikai Li","Shuyao Huang","Yujun Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11667v1","updated":"2024-01-22T02:59:27Z","published":"2024-01-22T02:59:27Z","title":"INCPrompt: Task-Aware incremental Prompting for Rehearsal-Free\n Class-incremental Learning","summary":" This paper introduces INCPrompt, an innovative continual learning solution\nthat effectively addresses catastrophic forgetting. INCPrompt's key innovation\nlies in its use of adaptive key-learner and task-aware prompts that capture\ntask-relevant information. This unique combination encapsulates general\nknowledge across tasks and encodes task-specific knowledge. Our comprehensive\nevaluation across multiple continual learning benchmarks demonstrates\nINCPrompt's superiority over existing algorithms, showing its effectiveness in\nmitigating catastrophic forgetting while maintaining high performance. These\nresults highlight the significant impact of task-aware incremental prompting on\ncontinual learning performance.\n","authors":["Zhiyuan Wang","Xiaoyang Qu","Jing Xiao","Bokui Chen","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11667v1.pdf","comment":"Accepted by the 49th IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.11666v1","updated":"2024-01-22T02:58:53Z","published":"2024-01-22T02:58:53Z","title":"P2DT: Mitigating Forgetting in task-incremental Learning with\n progressive prompt Decision Transformer","summary":" Catastrophic forgetting poses a substantial challenge for managing\nintelligent agents controlled by a large model, causing performance degradation\nwhen these agents face new tasks. In our work, we propose a novel solution -\nthe Progressive Prompt Decision Transformer (P2DT). This method enhances a\ntransformer-based model by dynamically appending decision tokens during new\ntask training, thus fostering task-specific policies. Our approach mitigates\nforgetting in continual and offline reinforcement learning scenarios. Moreover,\nP2DT leverages trajectories collected via traditional reinforcement learning\nfrom all tasks and generates new task-specific tokens during training, thereby\nretaining knowledge from previous studies. Preliminary results demonstrate that\nour model effectively alleviates catastrophic forgetting and scales well with\nincreasing task environments.\n","authors":["Zhiyuan Wang","Xiaoyang Qu","Jing Xiao","Bokui Chen","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11666v1.pdf","comment":"Accepted by the 49th IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2212.00325v2","updated":"2024-01-22T02:56:53Z","published":"2022-12-01T07:19:17Z","title":"HashVFL: Defending Against Data Reconstruction Attacks in Vertical\n Federated Learning","summary":" Vertical Federated Learning (VFL) is a trending collaborative machine\nlearning model training solution. Existing industrial frameworks employ secure\nmulti-party computation techniques such as homomorphic encryption to ensure\ndata security and privacy. 
Despite these efforts, studies have revealed that\ndata leakage remains a risk in VFL due to the correlations between intermediate\nrepresentations and raw data. Neural networks can accurately capture these\ncorrelations, allowing an adversary to reconstruct the data. This emphasizes\nthe need for continued research into securing VFL systems.\n Our work shows that hashing is a promising solution to counter data\nreconstruction attacks. The one-way nature of hashing makes it difficult for an\nadversary to recover data from hash codes. However, implementing hashing in VFL\npresents new challenges, including vanishing gradients and information loss. To\naddress these issues, we propose HashVFL, which integrates hashing and\nsimultaneously achieves learnability, bit balance, and consistency.\n Experimental results indicate that HashVFL effectively maintains task\nperformance while defending against data reconstruction attacks. It also brings\nadditional benefits in reducing the degree of label leakage, mitigating\nadversarial attacks, and detecting abnormal inputs. We hope our work will\ninspire further research into the potential applications of HashVFL.\n","authors":["Pengyu Qiu","Xuhong Zhang","Shouling Ji","Chong Fu","Xing Yang","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2212.00325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11665v1","updated":"2024-01-22T02:54:58Z","published":"2024-01-22T02:54:58Z","title":"Accelerating Approximate Thompson Sampling with Underdamped Langevin\n Monte Carlo","summary":" Approximate Thompson sampling with Langevin Monte Carlo broadens its reach\nfrom Gaussian posterior sampling to encompass more general smooth posteriors.\nHowever, it still encounters scalability issues in high-dimensional problems\nwhen demanding high accuracy. To address this, we propose an approximate\nThompson sampling strategy, utilizing underdamped Langevin Monte Carlo, where\nthe latter is the go-to workhorse for simulations of high-dimensional\nposteriors. Based on the standard smoothness and log-concavity conditions, we\nstudy the accelerated posterior concentration and sampling using a specific\npotential function. This design improves the sample complexity for realizing\nlogarithmic regrets from $\\mathcal{\\tilde O}(d)$ to $\\mathcal{\\tilde\nO}(\\sqrt{d})$. The scalability and robustness of our algorithm are also\nempirically validated through synthetic experiments in high-dimensional bandit\nproblems.\n","authors":["Haoyang Zheng","Wei Deng","Christian Moya","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11665v1.pdf","comment":"50 pages, 1 figure, to appear in AISTATS 2024"},{"id":"http://arxiv.org/abs/2401.11664v1","updated":"2024-01-22T02:50:38Z","published":"2024-01-22T02:50:38Z","title":"Zero-Space Cost Fault Tolerance for Transformer-based Language Models on\n ReRAM","summary":" Resistive Random Access Memory (ReRAM) has emerged as a promising platform\nfor deep neural networks (DNNs) due to its support for parallel in-situ\nmatrix-vector multiplication. However, hardware failures, such as\nstuck-at-fault defects, can result in significant prediction errors during\nmodel inference. While additional crossbars can be used to address these\nfailures, they come with storage overhead and are not efficient in terms of\nspace, energy, and cost. In this paper, we propose a fault protection mechanism\nthat incurs zero space cost. 
Our approach includes: 1) differentiable structure\npruning of rows and columns to reduce model redundancy, 2) weight duplication\nand voting for robust output, and 3) embedding duplicated most significant bits\n(MSBs) into the model weight. We evaluate our method on nine tasks of the GLUE\nbenchmark with the BERT model, and experimental results prove its\neffectiveness.\n","authors":["Bingbing Li","Geng Yuan","Zigeng Wang","Shaoyi Huang","Hongwu Peng","Payman Behnam","Wujie Wen","Hang Liu","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2401.11664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11835v2","updated":"2024-01-22T02:48:48Z","published":"2023-12-19T04:03:47Z","title":"Provably Convergent Federated Trilevel Learning","summary":" Trilevel learning, also called trilevel optimization (TLO), has been\nrecognized as a powerful modelling tool for hierarchical decision process and\nwidely applied in many machine learning applications, such as robust neural\narchitecture search, hyperparameter optimization, and domain adaptation.\nTackling TLO problems has presented a great challenge due to their nested\ndecision-making structure. In addition, existing works on TLO face the\nfollowing key challenges: 1) they all focus on the non-distributed setting,\nwhich may lead to privacy breach; 2) they do not offer any non-asymptotic\nconvergence analysis which characterizes how fast an algorithm converges. To\naddress the aforementioned challenges, this paper proposes an asynchronous\nfederated trilevel optimization method to solve TLO problems. The proposed\nmethod utilizes $\\mu$-cuts to construct a hyper-polyhedral approximation for\nthe TLO problem and solve it in an asynchronous manner. We demonstrate that the\nproposed $\\mu$-cuts are applicable to not only convex functions but also a wide\nrange of non-convex functions that meet the $\\mu$-weakly convex assumption.\nFurthermore, we theoretically analyze the non-asymptotic convergence rate for\nthe proposed method by showing its iteration complexity to obtain\n$\\epsilon$-stationary point is upper bounded by\n$\\mathcal{O}(\\frac{1}{\\epsilon^2})$. Extensive experiments on real-world\ndatasets have been conducted to elucidate the superiority of the proposed\nmethod, e.g., it has a faster convergence rate with a maximum acceleration of\napproximately 80$\\%$.\n","authors":["Yang Jiao","Kai Yang","Tiancheng Wu","Chengtao Jian","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2312.11835v2.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2305.16789v2","updated":"2024-01-22T02:47:50Z","published":"2023-05-26T09:59:48Z","title":"Modulate Your Spectrum in Self-Supervised Learning","summary":" Whitening loss offers a theoretical guarantee against feature collapse in\nself-supervised learning (SSL) with joint embedding architectures. Typically,\nit involves a hard whitening approach, transforming the embedding and applying\nloss to the whitened output. In this work, we introduce Spectral Transformation\n(ST), a framework to modulate the spectrum of embedding and to seek for\nfunctions beyond whitening that can avoid dimensional collapse. We show that\nwhitening is a special instance of ST by definition, and our empirical\ninvestigations unveil other ST instances capable of preventing collapse.\nAdditionally, we propose a novel ST instance named IterNorm with trace loss\n(INTL). 
Theoretical analysis confirms INTL's efficacy in preventing collapse\nand modulating the spectrum of embedding toward equal-eigenvalues during\noptimization. Our experiments on ImageNet classification and COCO object\ndetection demonstrate INTL's potential in learning superior representations.\nThe code is available at https://github.com/winci-ai/INTL.\n","authors":["Xi Weng","Yunhao Ni","Tengwei Song","Jie Luo","Rao Muhammad Anwer","Salman Khan","Fahad Shahbaz Khan","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2305.16789v2.pdf","comment":"Accepted at ICLR 2024. The code is available at\n https://github.com/winci-ai/intl"},{"id":"http://arxiv.org/abs/2401.11660v1","updated":"2024-01-22T02:33:38Z","published":"2024-01-22T02:33:38Z","title":"Differentiable Tree Search in Latent State Space","summary":" In decision-making problems with limited training data, policy functions\napproximated using deep neural networks often exhibit suboptimal performance.\nAn alternative approach involves learning a world model from the limited data\nand determining actions through online search. However, the performance is\nadversely affected by compounding errors arising from inaccuracies in the\nlearnt world model. While methods like TreeQN have attempted to address these\ninaccuracies by incorporating algorithmic structural biases into their\narchitectures, the biases they introduce are often weak and insufficient for\ncomplex decision-making tasks. In this work, we introduce Differentiable Tree\nSearch (DTS), a novel neural network architecture that significantly\nstrengthens the inductive bias by embedding the algorithmic structure of a\nbest-first online search algorithm. DTS employs a learnt world model to conduct\na fully differentiable online search in latent state space. The world model is\njointly optimised with the search algorithm, enabling the learning of a robust\nworld model and mitigating the effect of model inaccuracies. We address\npotential Q-function discontinuities arising from naive incorporation of\nbest-first search by adopting a stochastic tree expansion policy, formulating\nsearch tree expansion as a decision-making task, and introducing an effective\nvariance reduction technique for the gradient computation. We evaluate DTS in\nan offline-RL setting with a limited training data scenario on Procgen games\nand grid navigation task, and demonstrate that DTS outperforms popular\nmodel-free and model-based baselines.\n","authors":["Dixant Mittal","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06333v2","updated":"2024-01-22T02:22:12Z","published":"2023-10-10T06:03:51Z","title":"Learning bounded-degree polytrees with known skeleton","summary":" We establish finite-sample guarantees for efficient proper learning of\nbounded-degree polytrees, a rich class of high-dimensional probability\ndistributions and a subclass of Bayesian networks, a widely-studied type of\ngraphical model. Recently, Bhattacharyya et al. (2021) obtained finite-sample\nguarantees for recovering tree-structured Bayesian networks, i.e., 1-polytrees.\nWe extend their results by providing an efficient algorithm which learns\n$d$-polytrees in polynomial time and sample complexity for any bounded $d$ when\nthe underlying undirected graph (skeleton) is known. 
We complement our\nalgorithm with an information-theoretic sample complexity lower bound, showing\nthat the dependence on the dimension and target accuracy parameters are nearly\ntight.\n","authors":["Davin Choo","Joy Qiping Yang","Arnab Bhattacharyya","Clément L. Canonne"],"pdf_url":"https://arxiv.org/pdf/2310.06333v2.pdf","comment":"Fixed some typos. Added some discussions. Accepted to ALT 2024"},{"id":"http://arxiv.org/abs/2401.11652v1","updated":"2024-01-22T02:17:36Z","published":"2024-01-22T02:17:36Z","title":"OnDev-LCT: On-Device Lightweight Convolutional Transformers towards\n federated learning","summary":" Federated learning (FL) has emerged as a promising approach to\ncollaboratively train machine learning models across multiple edge devices\nwhile preserving privacy. The success of FL hinges on the efficiency of\nparticipating models and their ability to handle the unique challenges of\ndistributed learning. While several variants of Vision Transformer (ViT) have\nshown great potential as alternatives to modern convolutional neural networks\n(CNNs) for centralized training, the unprecedented size and higher\ncomputational demands hinder their deployment on resource-constrained edge\ndevices, challenging their widespread application in FL. Since client devices\nin FL typically have limited computing resources and communication bandwidth,\nmodels intended for such devices must strike a balance between model size,\ncomputational efficiency, and the ability to adapt to the diverse and non-IID\ndata distributions encountered in FL. To address these challenges, we propose\nOnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks\nwith limited training data and resources. Our models incorporate image-specific\ninductive biases through the LCT tokenizer by leveraging efficient depthwise\nseparable convolutions in residual linear bottleneck blocks to extract local\nfeatures, while the multi-head self-attention (MHSA) mechanism in the LCT\nencoder implicitly facilitates capturing global representations of images.\nExtensive experiments on benchmark image datasets indicate that our models\noutperform existing lightweight vision models while having fewer parameters and\nlower computational demands, making them suitable for FL scenarios with data\nheterogeneity and communication bottlenecks.\n","authors":["Chu Myaet Thwal","Minh N. H. Nguyen","Ye Lin Tun","Seong Tae Kim","My T. Thai","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11652v1.pdf","comment":"Published in Neural Networks"},{"id":"http://arxiv.org/abs/2312.02277v2","updated":"2024-01-22T02:03:50Z","published":"2023-12-04T19:00:07Z","title":"ALEXR: An Optimal Single-Loop Algorithm for Convex Finite-Sum Coupled\n Compositional Stochastic Optimization","summary":" This paper revisits a class of convex Finite-Sum Coupled Compositional\nStochastic Optimization (cFCCO) problems with many applications, including\ngroup distributionally robust optimization (GDRO), learning with imbalanced\ndata, reinforcement learning, and learning to rank. To better solve these\nproblems, we introduce an efficient single-loop primal-dual block-coordinate\nproximal algorithm, dubbed ALEXR. This algorithm leverages block-coordinate\nstochastic mirror ascent updates for the dual variable and stochastic proximal\ngradient descent updates for the primal variable. 
We establish the convergence\nrates of ALEXR in both convex and strongly convex cases under smoothness and\nnon-smoothness conditions of involved functions, which not only improve the\nbest rates in previous works on smooth cFCCO problems but also expand the realm\nof cFCCO for solving more challenging non-smooth problems such as the dual form\nof GDRO. Finally, we present lower complexity bounds to demonstrate that the\nconvergence rates of ALEXR are optimal among first-order block-coordinate\nstochastic algorithms for the considered class of cFCCO problems.\n","authors":["Bokun Wang","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02277v2.pdf","comment":"Fixed several typos; Added some numerical experiments"},{"id":"http://arxiv.org/abs/2401.11648v1","updated":"2024-01-22T01:58:32Z","published":"2024-01-22T01:58:32Z","title":"Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal\n Contrastive EHR Modelling with Hierarchical Regularisation","summary":" Predicting next visit diagnosis using Electronic Health Records (EHR) is an\nessential task in healthcare, critical for devising proactive future plans for\nboth healthcare providers and patients. Nonetheless, many preceding studies\nhave not sufficiently addressed the heterogeneous and hierarchical\ncharacteristics inherent in EHR data, inevitably leading to sub-optimal\nperformance. To this end, we propose NECHO, a novel medical code-centric\nmultimodal contrastive EHR learning framework with hierarchical regularisation.\nFirst, we integrate multifaceted information encompassing medical codes,\ndemographics, and clinical notes using a tailored network design and a pair of\nbimodal contrastive losses, all of which pivot around a medical code\nrepresentation. We also regularise modality-specific encoders using a parental\nlevel information in medical ontology to learn hierarchical structure of EHR\ndata. A series of experiments on MIMIC-III data demonstrates effectiveness of\nour approach.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2401.11648v1.pdf","comment":"Accepted to EACL 2024 (The 18th Conference of the European Chapter of\n the Association for Computational Linguistics)"},{"id":"http://arxiv.org/abs/2401.11647v1","updated":"2024-01-22T01:57:31Z","published":"2024-01-22T01:57:31Z","title":"LW-FedSSL: Resource-efficient Layer-wise Federated Self-supervised\n Learning","summary":" Many recent studies integrate federated learning (FL) with self-supervised\nlearning (SSL) to take advantage of raw training data distributed across edge\ndevices. However, edge devices often struggle with high computation and\ncommunication costs imposed by SSL and FL algorithms. To tackle this hindrance,\nwe propose LW-FedSSL, a layer-wise federated self-supervised learning approach\nthat allows edge devices to incrementally train one layer of the model at a\ntime. LW-FedSSL comprises server-side calibration and representation alignment\nmechanisms to maintain comparable performance with end-to-end FedSSL while\nsignificantly lowering clients' resource requirements. The server-side\ncalibration mechanism takes advantage of the resource-rich server in an FL\nenvironment to assist in global model training. Meanwhile, the representation\nalignment mechanism encourages closeness between representations of FL local\nmodels and those of the global model. Our experiments show that LW-FedSSL has a\n$3.3 \\times$ lower memory requirement and a $3.2 \\times$ cheaper communication\ncost than its end-to-end counterpart. 
We also explore a progressive training\nstrategy called Prog-FedSSL that outperforms end-to-end training with a similar\nmemory requirement and a $1.8 \\times$ cheaper communication cost.\n","authors":["Ye Lin Tun","Chu Myaet Thwal","Le Quang Huy","Minh N. H. Nguyen","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11646v1","updated":"2024-01-22T01:45:34Z","published":"2024-01-22T01:45:34Z","title":"Nonparametric Estimation via Variance-Reduced Sketching","summary":" Nonparametric models are of great interest in various scientific and\nengineering disciplines. Classical kernel methods, while numerically robust and\nstatistically sound in low-dimensional settings, become inadequate in\nhigher-dimensional settings due to the curse of dimensionality. In this paper,\nwe introduce a new framework called Variance-Reduced Sketching (VRS),\nspecifically designed to estimate density functions and nonparametric\nregression functions in higher dimensions with a reduced curse of\ndimensionality. Our framework conceptualizes multivariable functions as\ninfinite-size matrices, and facilitates a new sketching technique motivated by\nnumerical linear algebra literature to reduce the variance in estimation\nproblems. We demonstrate the robust numerical performance of VRS through a\nseries of simulated experiments and real-world data applications. Notably, VRS\nshows remarkable improvement over existing neural network estimators and\nclassical kernel methods in numerous density estimation and nonparametric\nregression models. Additionally, we offer theoretical justifications for VRS to\nsupport its ability to deliver nonparametric estimation with a reduced curse of\ndimensionality.\n","authors":["Yuehaw Khoo","Yifan Peng","Daren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11646v1.pdf","comment":"64 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.16113v2","updated":"2024-01-22T01:38:12Z","published":"2023-12-20T08:16:53Z","title":"Task-Driven Causal Feature Distillation: Towards Trustworthy Risk\n Prediction","summary":" Since artificial intelligence has seen tremendous recent successes in many\nareas, it has sparked great interest in its potential for trustworthy and\ninterpretable risk prediction. However, most models lack causal reasoning and\nstruggle with class imbalance, leading to poor precision and recall. To address\nthis, we propose a Task-Driven Causal Feature Distillation model (TDCFD) to\ntransform original feature values into causal feature attributions for the\nspecific risk prediction task. The causal feature attribution helps describe\nhow much contribution the value of this feature can make to the risk prediction\nresult. After the causal feature distillation, a deep neural network is applied\nto produce trustworthy prediction results with causal interpretability and high\nprecision/recall. 
We evaluate the performance of our TDCFD method on several\nsynthetic and real datasets, and the results demonstrate its superiority over\nthe state-of-the-art methods regarding precision, recall, interpretability, and\ncausality.\n","authors":["Zhixuan Chu","Mengxuan Hu","Qing Cui","Longfei Li","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2312.16113v2.pdf","comment":"Proceedings of the 2024 AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2109.01636v4","updated":"2024-01-22T01:23:23Z","published":"2021-09-03T17:28:04Z","title":"Empirical Study of Named Entity Recognition Performance Using\n Distribution-aware Word Embedding","summary":" With the fast development of Deep Learning techniques, Named Entity\nRecognition (NER) is becoming more and more important in the information\nextraction task. The greatest difficulty that the NER task faces is to keep the\ndetectability even when types of NE and documents are unfamiliar. Realizing\nthat the specificity information may contain potential meanings of a word and\ngenerate semantic-related features for word embedding, we develop a\ndistribution-aware word embedding and implement three different methods to make\nuse of the distribution information in a NER framework. And the result shows\nthat the performance of NER will be improved if the word specificity is\nincorporated into existing NER methods.\n","authors":["Xin Chen","Qi Zhao","Xinyang Liu"],"pdf_url":"https://arxiv.org/pdf/2109.01636v4.pdf","comment":"Want to correct"},{"id":"http://arxiv.org/abs/2401.01084v2","updated":"2024-01-22T01:16:24Z","published":"2024-01-02T07:56:17Z","title":"Global Convergence of Natural Policy Gradient with Hessian-aided\n Momentum Variance Reduction","summary":" Natural policy gradient (NPG) and its variants are widely-used policy search\nmethods in reinforcement learning. Inspired by prior work, a new NPG variant\ncoined NPG-HM is developed in this paper, which utilizes the Hessian-aided\nmomentum technique for variance reduction, while the sub-problem is solved via\nthe stochastic gradient descent method. It is shown that NPG-HM can achieve the\nglobal last iterate $\\epsilon$-optimality with a sample complexity of\n$\\mathcal{O}(\\epsilon^{-2})$, which is the best known result for natural policy\ngradient type methods under the generic Fisher non-degenerate policy\nparameterizations. The convergence analysis is built upon a relaxed weak\ngradient dominance property tailored for NPG under the compatible function\napproximation framework, as well as a neat way to decompose the error when\nhandling the sub-problem. Moreover, numerical experiments on Mujoco-based\nenvironments demonstrate the superior performance of NPG-HM over other\nstate-of-the-art policy gradient methods.\n","authors":["Jie Feng","Ke Wei","Jinchi Chen"],"pdf_url":"https://arxiv.org/pdf/2401.01084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17778v3","updated":"2024-01-22T00:54:30Z","published":"2023-06-30T16:31:14Z","title":"Look, Remember and Reason: Grounded reasoning in videos with language\n models","summary":" Multi-modal language models (LM) have recently shown promising performance in\nhigh-level reasoning tasks on videos. However, existing methods still fall\nshort in tasks like causal or compositional spatiotemporal reasoning over\nactions, in which model predictions need to be grounded in fine-grained\nlow-level details, such as object motions and object interactions. 
In this\nwork, we propose training an LM end-to-end on low-level surrogate tasks,\nincluding object detection, re-identification, and tracking, to endow the model\nwith the required low-level visual capabilities. We show that a two-stream\nvideo encoder with spatiotemporal attention is effective at capturing the\nrequired static and motion-based cues in the video. By leveraging the LM's\nability to perform the low-level surrogate tasks, we can cast reasoning in\nvideos as the three-step process of Look, Remember, Reason wherein visual\ninformation is extracted using low-level visual skills step-by-step and then\nintegrated to arrive at a final answer. We demonstrate the effectiveness of our\nframework on diverse visual reasoning tasks from the ACRE, CATER,\nSomething-Else and STAR datasets. Our approach is trainable end-to-end and\nsurpasses state-of-the-art task-specific methods across these tasks by a large\nmargin.\n","authors":["Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Reza Pourreza","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2306.17778v3.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2306.09136v3","updated":"2024-01-22T00:51:05Z","published":"2023-06-15T13:49:30Z","title":"Finite-Time Logarithmic Bayes Regret Upper Bounds","summary":" We derive the first finite-time logarithmic Bayes regret upper bounds for\nBayesian bandits. In a multi-armed bandit, we obtain $O(c_\\Delta \\log n)$ and\n$O(c_h \\log^2 n)$ upper bounds for an upper confidence bound algorithm, where\n$c_h$ and $c_\\Delta$ are constants depending on the prior distribution and the\ngaps of bandit instances sampled from it, respectively. The latter bound\nasymptotically matches the lower bound of Lai (1987). Our proofs are a major\ntechnical departure from prior works, while being simple and general. To show\nthe generality of our techniques, we apply them to linear bandits. Our results\nprovide insights on the value of prior in the Bayesian setting, both in the\nobjective and as a side information given to the learner. They significantly\nimprove upon existing $\\tilde{O}(\\sqrt{n})$ bounds, which have become standard\nin the literature despite the logarithmic lower bound of Lai (1987).\n","authors":["Alexia Atsidakou","Branislav Kveton","Sumeet Katariya","Constantine Caramanis","Sujay Sanghavi"],"pdf_url":"https://arxiv.org/pdf/2306.09136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13118v2","updated":"2024-01-22T00:50:55Z","published":"2023-12-20T15:37:50Z","title":"LRS: Enhancing Adversarial Transferability through Lipschitz Regularized\n Surrogate","summary":" The transferability of adversarial examples is of central importance to\ntransfer-based black-box adversarial attacks. Previous works for generating\ntransferable adversarial examples focus on attacking \\emph{given} pretrained\nsurrogate models while the connections between surrogate models and adversarial\ntrasferability have been overlooked. In this paper, we propose {\\em Lipschitz\nRegularized Surrogate} (LRS) for transfer-based black-box attacks, a novel\napproach that transforms surrogate models towards favorable adversarial\ntransferability. Using such transformed surrogate models, any existing\ntransfer-based black-box attack can run without any change, yet achieving much\nbetter performance. 
Specifically, we impose Lipschitz regularization on the\nloss landscape of surrogate models to enable a smoother and more controlled\noptimization process for generating more transferable adversarial examples. In\naddition, this paper also sheds light on the connection between the inner\nproperties of surrogate models and adversarial transferability, where three\nfactors are identified: smaller local Lipschitz constant, smoother loss\nlandscape, and stronger adversarial robustness. We evaluate our proposed LRS\napproach by attacking state-of-the-art standard deep neural networks and\ndefense models. The results demonstrate significant improvement on the attack\nsuccess rates and transferability. Our code is available at\nhttps://github.com/TrustAIoT/LRS.\n","authors":["Tao Wu","Tie Luo","Donald C. Wunsch"],"pdf_url":"https://arxiv.org/pdf/2312.13118v2.pdf","comment":"AAAI 2024 main track. Code available on Github (see abstract).\n Appendix is included in this updated version"},{"id":"http://arxiv.org/abs/2206.14358v2","updated":"2024-01-22T00:38:08Z","published":"2022-06-29T01:57:44Z","title":"Using Twitter Data to Understand Public Perceptions of Approved versus\n Off-label Use for COVID-19-related Medications","summary":" Understanding public discourse on emergency use of unproven therapeutics is\ncrucial for monitoring safe use and combating misinformation. We developed a\nnatural language processing-based pipeline to comprehend public perceptions of\nand stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter\nover time. This retrospective study included 609,189 US-based tweets from\nJanuary 29, 2020, to November 30, 2021, about four drugs that garnered\nsignificant public attention during the COVID-19 pandemic: (1)\nHydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2)\nMolnupiravir and Remdesivir, FDA-approved treatments for eligible patients.\nTime-trend analysis was employed to understand popularity trends and related\nevents. Content and demographic analyses were conducted to explore potential\nrationales behind people's stances on each drug. Time-trend analysis indicated\nthat Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir\nand Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and\nIvermectin discussions were highly politicized, related to conspiracy theories,\nhearsay, and celebrity influences. The distribution of stances between the two\nmajor US political parties was significantly different (P < .001); Republicans\nwere more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than\nDemocrats. People with healthcare backgrounds tended to oppose\nHydroxychloroquine (7%) more than the general population, while the general\npopulation was more likely to support Ivermectin (14%). Our study found that\nsocial media users have varying perceptions and stances on off-label versus\nFDA-authorized drug use at different stages of COVID-19. This indicates that\nhealth systems, regulatory agencies, and policymakers should design tailored\nstrategies to monitor and reduce misinformation to promote safe drug use.\n","authors":["Yining Hua","Hang Jiang","Shixu Lin","Jie Yang","Joseph M. Plasek","David W. 
Bates","Li Zhou"],"pdf_url":"https://arxiv.org/pdf/2206.14358v2.pdf","comment":"Full paper published in JAMIA"},{"id":"http://arxiv.org/abs/2310.17168v2","updated":"2024-01-22T00:12:20Z","published":"2023-10-26T05:49:13Z","title":"Learning an Inventory Control Policy with General Inventory Arrival\n Dynamics","summary":" In this paper we address the problem of learning and backtesting inventory\ncontrol policies in the presence of general arrival dynamics -- which we term\nas a quantity-over-time arrivals model (QOT). We also allow for order\nquantities to be modified as a post-processing step to meet vendor constraints\nsuch as order minimum and batch size constraints -- a common practice in real\nsupply chains. To the best of our knowledge this is the first work to handle\neither arbitrary arrival dynamics or an arbitrary downstream post-processing of\norder quantities. Building upon recent work (Madeka et al., 2022) we similarly\nformulate the periodic review inventory control problem as an exogenous\ndecision process, where most of the state is outside the control of the agent.\nMadeka et al., 2022 show how to construct a simulator that replays historic\ndata to solve this class of problem. In our case, we incorporate a deep\ngenerative model for the arrivals process as part of the history replay. By\nformulating the problem as an exogenous decision process, we can apply results\nfrom Madeka et al., 2022 to obtain a reduction to supervised learning. Via\nsimulation studies we show that this approach yields statistically significant\nimprovements in profitability over production baselines. Using data from a\nreal-world A/B test, we show that Gen-QOT generalizes well to off-policy data\nand that the resulting buying policy outperforms traditional inventory\nmanagement systems in real world settings.\n","authors":["Sohrab Andaz","Carson Eisenach","Dhruv Madeka","Kari Torkkola","Randy Jia","Dean Foster","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2310.17168v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.00647v2","updated":"2024-01-22T18:53:48Z","published":"2023-10-01T12:02:59Z","title":"Beyond Task Performance: Evaluating and Reducing the Flaws of Large\n Multimodal Models with In-Context Learning","summary":" Following the success of Large Language Models (LLMs), Large Multimodal\nModels (LMMs), such as the Flamingo model and its subsequent competitors, have\nstarted to emerge as natural steps towards generalist agents. However,\ninteracting with recent LMMs reveals major limitations that are hardly captured\nby the current evaluation benchmarks. Indeed, task performances (e.g., VQA\naccuracy) alone do not provide enough clues to understand their real\ncapabilities, limitations, and to which extent such models are aligned to human\nexpectations. To refine our understanding of those flaws, we deviate from the\ncurrent evaluation paradigm, and (1) evaluate 10 recent open-source LMMs from\n3B up to 80B parameter scale, on 5 different axes; hallucinations, abstention,\ncompositionality, explainability and instruction following. Our evaluation on\nthese axes reveals major flaws in LMMs. While the current go-to solution to\nalign these models is based on training, such as instruction tuning or RLHF, we\nrather (2) explore the training-free in-context learning (ICL) as a solution,\nand study how it affects these limitations. 
Based on our ICL study, (3) we push\nICL further and propose new multimodal ICL variants such as; Multitask-ICL,\nChain-of-Hindsight-ICL, and Self-Correcting-ICL. Our findings are as follows.\n(1) Despite their success, LMMs have flaws that remain unsolved with scaling\nalone. (2) The effect of ICL on LMMs flaws is nuanced; despite its\neffectiveness for improved explainability, answer abstention, ICL only slightly\nimproves instruction following, does not improve compositional abilities, and\nactually even amplifies hallucinations. (3) The proposed ICL variants are\npromising as post-hoc approaches to efficiently tackle some of those flaws. The\ncode is available here: https://github.com/mshukor/EvALign-ICL.\n","authors":["Mustafa Shukor","Alexandre Rame","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2310.00647v2.pdf","comment":"ICLR 2024. Project Page: https://evalign-icl.github.io/"},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11818v1","updated":"2024-01-22T10:26:52Z","published":"2024-01-22T10:26:52Z","title":"MInD: Improving Multimodal Sentiment Analysis via Multimodal Information\n Disentanglement","summary":" Learning effective joint representations has been a central task in\nmultimodal sentiment analysis. Previous methods focus on leveraging the\ncorrelations between different modalities and enhancing performance through\nsophisticated fusion techniques. However, challenges still exist due to the\ninherent heterogeneity of distinct modalities, which may lead to distributional\ngap, impeding the full exploitation of inter-modal information and resulting in\nredundancy and impurity in the information extracted from features. To address\nthis problem, we introduce the Multimodal Information Disentanglement (MInD)\napproach. MInD decomposes the multimodal inputs into a modality-invariant\ncomponent, a modality-specific component, and a remnant noise component for\neach modality through a shared encoder and multiple private encoders. The\nshared encoder aims to explore the shared information and commonality across\nmodalities, while the private encoders are deployed to capture the distinctive\ninformation and characteristic features. These representations thus furnish a\ncomprehensive perspective of the multimodal data, facilitating the fusion\nprocess instrumental for subsequent prediction tasks. 
Furthermore, MInD\nimproves the learned representations by explicitly modeling the task-irrelevant\nnoise in an adversarial manner. Experimental evaluations conducted on benchmark\ndatasets, including CMU-MOSI, CMU-MOSEI, and UR-Funny, demonstrate MInD's\nsuperior performance over existing state-of-the-art methods in both multimodal\nemotion recognition and multimodal humor detection tasks.\n","authors":["Weichen Dai","Xingyu Li","Pengbo Hu","Zeyu Wang","Ji Qi","Jianlin Peng","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11764v1","updated":"2024-01-22T08:59:09Z","published":"2024-01-22T08:59:09Z","title":"Identity-Driven Multimedia Forgery Detection via Reference Assistance","summary":" Recent advancements in technologies, such as the 'deepfake' technique, have\npaved the way for the generation of various media forgeries. In response to the\npotential hazards of these media forgeries, many researchers engage in\nexploring detection methods, increasing the demand for high-quality media\nforgery datasets. Despite this, existing datasets have certain limitations.\nFirstly, most of datasets focus on the manipulation of visual modality and\nusually lack diversity, as only a few forgery approaches are considered.\nSecondly, the quality of media is often inadequate in clarity and naturalness.\nMeanwhile, the size of the dataset is also limited. Thirdly, while many\nreal-world forgeries are driven by identity, the identity information of the\nsubject in media is frequently neglected. For detection, identity information\ncould be an essential clue to boost accuracy. Moreover, official media\nconcerning certain identities on the Internet can serve as prior knowledge,\naiding both the audience and forgery detectors in determining the true\nidentity. Therefore, we propose an identity-driven multimedia forgery dataset,\nIDForge, which contains 249,138 video shots. All video shots are sourced from\n324 wild videos collected of 54 celebrities from the Internet. The fake video\nshots involve 9 types of manipulation across visual, audio and textual\nmodalities. Additionally, IDForge provides extra 214,438 real video shots as a\nreference set for the 54 celebrities. Correspondingly, we design an effective\nmultimedia detection network, Reference-assisted Multimodal Forgery Detection\nNetwork (R-MFDN). Through extensive experiments on the proposed dataset, we\ndemonstrate the effectiveness of R-MFDN on the multimedia detection task.\n","authors":["Junhao Xu","Jingjing Chen","Xue Song","Feng Han","Haijun Shan","Yugang Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.11764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12264v1","updated":"2024-01-22T08:16:48Z","published":"2024-01-22T08:16:48Z","title":"CoAVT: A Cognition-Inspired Unified Audio-Visual-Text Pre-Training Model\n for Multimodal Processing","summary":" There has been a long-standing quest for a unified audio-visual-text model to\nenable various multimodal understanding tasks, which mimics the listening,\nseeing and reading process of human beings. Humans tends to represent knowledge\nusing two separate systems: one for representing verbal (textual) information\nand one for representing non-verbal (visual and auditory) information. 
These\ntwo systems can operate independently but can also interact with each other.\nMotivated by this understanding of human cognition, in this paper, we introduce\nCoAVT -- a novel cognition-inspired Correlated Audio-Visual-Text pre-training\nmodel to connect the three modalities. It contains a joint audio-visual encoder\nthat learns to encode audio-visual synchronization information together with\nthe audio and visual content for non-verbal information, and a text encoder to\nhandle textual input for verbal information. To bridge the gap between\nmodalities, CoAVT employs a query encoder, which contains a set of learnable\nquery embeddings, and extracts the most informative audiovisual features of the\ncorresponding text. Additionally, to leverage the correspondences between audio\nand vision with language respectively, we also establish the audio-text and\nvisual-text bi-modal alignments upon the foundational audiovisual-text\ntri-modal alignment to enhance the multimodal representation learning. Finally,\nwe jointly optimize CoAVT model with three multimodal objectives: contrastive\nloss, matching loss and language modeling loss. Extensive experiments show that\nCoAVT can learn strong multimodal correlations and be generalized to various\ndownstream tasks. CoAVT establishes new state-of-the-art performance on\ntext-video retrieval task on AudioCaps for both zero-shot and fine-tuning\nsettings, audio-visual event classification and audio-visual retrieval tasks on\nAudioSet and VGGSound.\n","authors":["Xianghu Yue","Xiaohai Tian","Malu Zhang","Zhizheng Wu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2401.12264v1.pdf","comment":null}]},"2024-01-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11626v1","updated":"2024-01-21T23:37:33Z","published":"2024-01-21T23:37:33Z","title":"Freely Long-Thinking Transformer (FraiLT)","summary":" Freely Long-Thinking Transformer (FraiLT) is an improved transformer model\ndesigned to enhance processing capabilities without scaling up size. 
It\nutilizes a recursive approach, iterating over a subset of layers multiple\ntimes, and introduces iteration encodings to maintain awareness across these\ncycles. Iteration encoding allows FraiLT to achieve the interpretive depth of\nlarger models in a compact form. When evaluated on a synthetic story dataset,\nFraiLT outperformed larger models, showcasing its ability to deliver\nhigh-quality performance while reducing memory demands. This model represents a\nstep forward towards more efficient and accessible language models.\n","authors":["Akbay Tabak"],"pdf_url":"https://arxiv.org/pdf/2401.11626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v1","updated":"2024-01-21T23:34:42Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["an Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11601v1","updated":"2024-01-21T21:21:51Z","published":"2024-01-21T21:21:51Z","title":"Robust Evaluation Measures for Evaluating Social Biases in Masked\n Language Models","summary":" Many evaluation measures are used to evaluate social biases in masked\nlanguage models (MLMs). However, we find that these previously proposed\nevaluation measures are lacking robustness in scenarios with limited datasets.\nThis is because these measures are obtained by comparing the\npseudo-log-likelihood (PLL) scores of the stereotypical and anti-stereotypical\nsamples using an indicator function. The disadvantage is the limited mining of\nthe PLL score sets without capturing its distributional information. In this\npaper, we represent a PLL score set as a Gaussian distribution and use Kullback\nLeibler (KL) divergence and Jensen Shannon (JS) divergence to construct\nevaluation measures for the distributions of stereotypical and\nanti-stereotypical PLL scores. 
Experimental results on the publicly available\ndatasets StereoSet (SS) and CrowS-Pairs (CP) show that our proposed measures\nare significantly more robust and interpretable than those proposed previously.\n","authors":["Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11601v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.01361v2","updated":"2024-01-21T21:01:12Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v2.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo and\n datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for more details"},{"id":"http://arxiv.org/abs/2309.12244v2","updated":"2024-01-21T16:30:35Z","published":"2023-09-21T16:43:17Z","title":"ChaCha: Leveraging Large Language Models to Prompt Children to Share\n Their Emotions about Personal Events","summary":" Children typically learn to identify and express emotions through sharing\ntheir stories and feelings with others, particularly their family. However, it\nis challenging for parents or siblings to have emotional communication with\nchildren since children are still developing their communication skills. We\npresent ChaCha, a chatbot that encourages and guides children to share personal\nevents and associated emotions. ChaCha combines a state machine and large\nlanguage models (LLMs) to keep the dialogue on track while carrying on\nfree-form conversations. 
Through an exploratory study with 20 children (aged\n8-12), we examine how ChaCha prompts children to share personal events and\nguides them to describe associated emotions. Participants perceived ChaCha as a\nclose friend and shared their stories on various topics, such as family trips\nand personal achievements. Based on the findings, we discuss opportunities for\nleveraging LLMs to design child-friendly chatbots to support children in\nsharing emotions.\n","authors":["Woosuk Seo","Chanmo Yang","Young-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2309.12244v2.pdf","comment":"16 pages, 5 figures, 2 tables; Accepted at ACM CHI 2024"},{"id":"http://arxiv.org/abs/2401.09074v2","updated":"2024-01-21T15:15:30Z","published":"2024-01-17T09:23:59Z","title":"Code Simulation Challenges for Large Language Models","summary":" We investigate the extent to which Large Language Models (LLMs) can simulate\nthe execution of computer code and algorithms. We begin by looking at straight\nline programs, and show that current LLMs demonstrate poor performance even\nwith such simple programs -- performance rapidly degrades with the length of\ncode. We then investigate the ability of LLMs to simulate programs that contain\ncritical paths and redundant instructions. We also go beyond straight line\nprogram simulation with sorting algorithms and nested loops, and we show the\ncomputational complexity of a routine directly affects the ability of an LLM to\nsimulate its execution. We observe that LLMs execute instructions sequentially\nand with a low error margin only for short programs or standard procedures.\nLLMs' code simulation is in tension with their pattern recognition and\nmemorisation capabilities: on tasks where memorisation is detrimental, we\npropose a novel prompting method to simulate code execution line by line.\nEmpirically, our new Chain of Simulation (CoSm) method improves on the standard\nChain of Thought prompting approach by avoiding the pitfalls of memorisation.\n","authors":["Emanuele La Malfa","Christoph Weinhuber","Orazio Torre","Fangru Lin","Anthony Cohn","Nigel Shadbolt","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2401.09074v2.pdf","comment":"main paper (10 pages) + Appendix (11 pages)"},{"id":"http://arxiv.org/abs/2302.12584v2","updated":"2024-01-21T14:51:26Z","published":"2023-02-24T11:44:24Z","title":"VivesDebate-Speech: A Corpus of Spoken Argumentation to Leverage Audio\n Features for Argument Mining","summary":" In this paper, we describe VivesDebate-Speech, a corpus of spoken\nargumentation created to leverage audio features for argument mining tasks. The\ncreation of this corpus represents an important contribution to the\nintersection of speech processing and argument mining communities, and one of\nthe most complete publicly available resources in this topic. Moreover, we have\nperformed a set of first-of-their-kind experiments which show an improvement\nwhen integrating audio features into the argument mining pipeline. 
The provided\nresults can be used as a baseline for future research.\n","authors":["Ramon Ruiz-Dolz","Javier Iranzo-Sánchez"],"pdf_url":"https://arxiv.org/pdf/2302.12584v2.pdf","comment":"5 pages; EMNLP 2023 Accepted Version"},{"id":"http://arxiv.org/abs/2203.14647v2","updated":"2024-01-21T14:39:30Z","published":"2022-03-28T11:09:07Z","title":"Automatic Debate Evaluation with Argumentation Semantics and Natural\n Language Argument Graph Networks","summary":" The lack of annotated data on professional argumentation and complete\nargumentative debates has led to the oversimplification and the inability of\napproaching more complex natural language processing tasks. Such is the case of\nthe automatic debate evaluation. In this paper, we propose an original hybrid\nmethod to automatically evaluate argumentative debates. For that purpose, we\ncombine concepts from argumentation theory such as argumentation frameworks and\nsemantics, with Transformer-based architectures and neural graph networks.\nFurthermore, we obtain promising results that lay the basis on an unexplored\nnew instance of the automatic analysis of natural language arguments.\n","authors":["Ramon Ruiz-Dolz","Stella Heras","Ana García-Fornes"],"pdf_url":"https://arxiv.org/pdf/2203.14647v2.pdf","comment":"EMNLP 2023 Accepted Version"},{"id":"http://arxiv.org/abs/2401.11505v1","updated":"2024-01-21T14:30:20Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. Code and\nmodels are available at https://github.com/kakaobrain/CheXGPT.\n","authors":["Jawook Gu","Han-Cheol Cho","Jiho Kim","Kihyun You","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.11504v1","updated":"2024-01-21T14:28:41Z","published":"2024-01-21T14:28:41Z","title":"With Greater Text Comes Greater Necessity: Inference-Time Training Helps\n Long Text Generation","summary":" Long text generation, such as novel writing or discourse-level translation\nwith extremely long contexts, presents significant challenges to current\nlanguage models. Existing methods mainly focus on extending the model's context\nwindow through strategies like length extrapolation. 
However, these approaches\ndemand substantial hardware resources during the training and/or inference\nphases. Our proposed method, Temp-Lora, introduces an alternative concept.\nInstead of relying on the KV cache to store all context information, Temp-Lora\nembeds this information directly into the model's parameters. In the process of\nlong text generation, we use a temporary Lora module, progressively trained\nwith text generated previously. This approach not only efficiently preserves\ncontextual knowledge but also prevents any permanent alteration to the model's\nparameters given that the module is discarded post-generation. Extensive\nexperiments on the PG19 language modeling benchmark and the GuoFeng\ndiscourse-level translation benchmark validate the effectiveness of Temp-Lora.\nOur results show that: 1) Temp-Lora substantially enhances generation quality\nfor long texts, as indicated by a 13.2% decrease in perplexity on a subset of\nPG19, and a 29.6% decrease in perplexity along with a 53.2% increase in BLEU\nscore on GuoFeng, 2) Temp-Lora is compatible with and enhances most existing\nlong text generation methods, and 3) Temp-Lora can greatly reduce computational\ncosts by shortening the context window. While ensuring a slight improvement in\ngeneration quality (a decrease of 3.8% in PPL), it enables a reduction of 70.5%\nin the FLOPs required for inference and a 51.5% decrease in latency.\n","authors":["Y. Wang","D. Ma","D. Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v4","updated":"2024-01-21T13:38:20Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving substantial memory and time costs compared to vanilla\nPT and its variants, without changing trainable parameter sizes. Through\nextensive experiments on 23 natural language processing (NLP) and\nvision-language (VL) tasks, we demonstrate that DePT outperforms\nstate-of-the-art PEFT approaches, including the full fine-tuning baseline, in\nsome scenarios. Additionally, we empirically show that DEPT grows more\nefficient as the model size increases. Our further study reveals that DePT\nintegrates seamlessly with parameter-efficient transfer learning in the\nfew-shot learning setting and highlights its adaptability to various model\narchitectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v4.pdf","comment":"ICLR 2024. 
Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2401.11487v1","updated":"2024-01-21T13:18:20Z","published":"2024-01-21T13:18:20Z","title":"Towards Better Inclusivity: A Diverse Tweet Corpus of English Varieties","summary":" The prevalence of social media presents a growing opportunity to collect and\nanalyse examples of English varieties. Whilst usage of these varieties was -\nand, in many cases, still is - used only in spoken contexts or hard-to-access\nprivate messages, social media sites like Twitter provide a platform for users\nto communicate informally in a scrapeable format. Notably, Indian English\n(Hinglish), Singaporean English (Singlish), and African-American English (AAE)\ncan be commonly found online. These varieties pose a challenge to existing\nnatural language processing (NLP) tools as they often differ orthographically\nand syntactically from standard English for which the majority of these tools\nare built. NLP models trained on standard English texts produced biased\noutcomes for users of underrepresented varieties. Some research has aimed to\novercome the inherent biases caused by unrepresentative data through techniques\nlike data augmentation or adjusting training models.\n We aim to address the issue of bias at its root - the data itself. We curate\na dataset of tweets from countries with high proportions of underserved English\nvariety speakers, and propose an annotation framework of six categorical\nclassifications along a pseudo-spectrum that measures the degree of standard\nEnglish and that thereby indirectly aims to surface the manifestations of\nEnglish varieties in these tweets. Following best annotation practices, our\ngrowing corpus features 170,800 tweets taken from 7 countries, labeled by\nannotators who are from those countries and can communicate in\nregionally-dominant varieties of English. Our corpus highlights the accuracy\ndiscrepancies in pre-trained language identifiers between western English and\nnon-western (i.e., less standard) English varieties. We hope to contribute to\nthe growing literature identifying and reducing the implicit demographic\ndiscrepancies in NLP.\n","authors":["Nhi Pham","Lachlan Pham","Adam L. Meyers"],"pdf_url":"https://arxiv.org/pdf/2401.11487v1.pdf","comment":"10 pages (including limitations, references and appendices), 2\n figures"},{"id":"http://arxiv.org/abs/2310.15823v3","updated":"2024-01-21T12:40:48Z","published":"2023-10-24T13:23:57Z","title":"Rosetta Stone at KSAA-RD Shared Task: A Hop From Language Modeling To\n Word--Definition Alignment","summary":" A Reverse Dictionary is a tool enabling users to discover a word based on its\nprovided definition, meaning, or description. Such a technique proves valuable\nin various scenarios, aiding language learners who possess a description of a\nword without its identity, and benefiting writers seeking precise terminology.\nThese scenarios often encapsulate what is referred to as the\n\"Tip-of-the-Tongue\" (TOT) phenomena. In this work, we present our winning\nsolution for the Arabic Reverse Dictionary shared task. This task focuses on\nderiving a vector representation of an Arabic word from its accompanying\ndescription. The shared task encompasses two distinct subtasks: the first\ninvolves an Arabic definition as input, while the second employs an English\ndefinition. For the first subtask, our approach relies on an ensemble of\nfinetuned Arabic BERT-based models, predicting the word embedding for a given\ndefinition. 
The final representation is obtained through averaging the output\nembeddings from each model within the ensemble. In contrast, the most effective\nsolution for the second subtask involves translating the English test\ndefinitions into Arabic and applying them to the finetuned models originally\ntrained for the first subtask. This straightforward method achieves the highest\nscore across both subtasks.\n","authors":["Ahmed ElBakry","Mohamed Gabr","Muhammad ElNokrashy","Badr AlKhamissi"],"pdf_url":"https://arxiv.org/pdf/2310.15823v3.pdf","comment":"Proceedings of ArabicNLP 2023"},{"id":"http://arxiv.org/abs/2401.11467v1","updated":"2024-01-21T11:42:18Z","published":"2024-01-21T11:42:18Z","title":"Over-Reasoning and Redundant Calculation of Large Language Models","summary":" Large language models (LLMs) can solve problems step-by-step. While this\nchain-of-thought (CoT) reasoning boosts LLMs' performance, it is unclear if\nLLMs \\textit{know} when to use CoT and whether those CoT are always necessary\nto answer the question. This paper shows that LLMs tend to generate redundant\ncalculations and reasoning on a manually constructed math QA dataset,\nGSM8K-Zero. GSM8K-Zero is constructed such that the questions can be answered\nwithout any calculations, but LLMs, including Llama-2 models and Claude-2, tend\nto generate lengthy and unnecessary calculations to answer the questions. We\nalso conduct experiments to explain why LLMs generate redundant calculations\nand reasonings. GSM8K-Zero is publicly available at\nhttps://github.com/d223302/Over-Reasoning-of-LLMs and\nhttps://huggingface.co/datasets/dcml0714/GSM8K-Zero.\n","authors":["Cheng-Han Chiang","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11467v1.pdf","comment":"EACL 2024 main conference paper. Camera-ready version"},{"id":"http://arxiv.org/abs/2401.11463v1","updated":"2024-01-21T11:04:30Z","published":"2024-01-21T11:04:30Z","title":"Estimating the Usefulness of Clarifying Questions and Answers for\n Conversational Search","summary":" While the body of research directed towards constructing and generating\nclarifying questions in mixed-initiative conversational search systems is vast,\nresearch aimed at processing and comprehending users' answers to such questions\nis scarce. To this end, we present a simple yet effective method for processing\nanswers to clarifying questions, moving away from previous work that simply\nappends answers to the original query and thus potentially degrades retrieval\nperformance. Specifically, we propose a classifier for assessing usefulness of\nthe prompted clarifying question and an answer given by the user. Useful\nquestions or answers are further appended to the conversation history and\npassed to a transformer-based query rewriting module. Results demonstrate\nsignificant improvements over strong non-mixed-initiative baselines.\nFurthermore, the proposed approach mitigates the performance drops when non\nuseful questions and answers are utilized.\n","authors":["Ivan Sekulić","Weronika Łajewska","Krisztian Balog","Fabio Crestani"],"pdf_url":"https://arxiv.org/pdf/2401.11463v1.pdf","comment":"This is the author's version of the work. 
The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11458v1","updated":"2024-01-21T10:46:23Z","published":"2024-01-21T10:46:23Z","title":"Linear Alignment: A Closed-form Solution for Aligning Human Preferences\n without Tuning and Feedback","summary":" The success of AI assistants based on Language Models (LLMs) hinges on\nReinforcement Learning from Human Feedback (RLHF) to comprehend and align with\nuser intentions. However, traditional alignment algorithms, such as PPO, are\nhampered by complex annotation and training requirements. This reliance limits\nthe applicability of RLHF and hinders the development of professional\nassistants tailored to diverse human preferences. In this work, we introduce\n\\textit{Linear Alignment}, a novel algorithm that aligns language models with\nhuman preferences in one single inference step, eliminating the reliance on\ndata annotation and model training. Linear alignment incorporates a new\nparameterization for policy optimization under divergence constraints, which\nenables the extraction of optimal policy in a closed-form manner and\nfacilitates the direct estimation of the aligned response. Extensive\nexperiments on both general and personalized preference datasets demonstrate\nthat linear alignment significantly enhances the performance and efficiency of\nLLM alignment across diverse scenarios. Our code and dataset will be published\non \\url{https://github.com/Wizardcoast/Linear_Alignment.git}.\n","authors":["Songyang Gao","Qiming Ge","Wei Shen","Shihan Dou","Junjie Ye","Xiao Wang","Rui Zheng","Yicheng Zou","Zhi Chen","Hang Yan","Qi Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11452v1","updated":"2024-01-21T10:15:36Z","published":"2024-01-21T10:15:36Z","title":"Towards Reliable and Factual Response Generation: Detecting Unanswerable\n Questions in Information-Seeking Conversations","summary":" Generative AI models face the challenge of hallucinations that can undermine\nusers' trust in such systems. We approach the problem of conversational\ninformation seeking as a two-step process, where relevant passages in a corpus\nare identified first and then summarized into a final system response. This way\nwe can automatically assess if the answer to the user's question is present in\nthe corpus. Specifically, our proposed method employs a sentence-level\nclassifier to detect if the answer is present, then aggregates these\npredictions on the passage level, and eventually across the top-ranked passages\nto arrive at a final answerability estimate. For training and evaluation, we\ndevelop a dataset based on the TREC CAsT benchmark that includes answerability\nlabels on the sentence, passage, and ranking levels. We demonstrate that our\nproposed method represents a strong baseline and outperforms a state-of-the-art\nLLM on the answerability prediction task.\n","authors":["Weronika Łajewska","Krisztian Balog"],"pdf_url":"https://arxiv.org/pdf/2401.11452v1.pdf","comment":"This is the author's version of the work. 
The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2312.11532v2","updated":"2024-01-21T09:30:36Z","published":"2023-12-15T15:01:10Z","title":"Topic-VQ-VAE: Leveraging Latent Codebooks for Flexible Topic-Guided\n Document Generation","summary":" This paper introduces a novel approach for topic modeling utilizing latent\ncodebooks from Vector-Quantized Variational Auto-Encoder~(VQ-VAE), discretely\nencapsulating the rich information of the pre-trained embeddings such as the\npre-trained language model. From the novel interpretation of the latent\ncodebooks and embeddings as conceptual bag-of-words, we propose a new\ngenerative topic model called Topic-VQ-VAE~(TVQ-VAE) which inversely generates\nthe original documents related to the respective latent codebook. The TVQ-VAE\ncan visualize the topics with various generative distributions including the\ntraditional BoW distribution and the autoregressive image generation. Our\nexperimental results on document analysis and image generation demonstrate that\nTVQ-VAE effectively captures the topic context which reveals the underlying\nstructures of the dataset and supports flexible forms of document generation.\nOfficial implementation of the proposed TVQ-VAE is available at\nhttps://github.com/clovaai/TVQ-VAE.\n","authors":["YoungJoon Yoo","Jongwon Choi"],"pdf_url":"https://arxiv.org/pdf/2312.11532v2.pdf","comment":"Published in the 38th annual AAAI conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2401.11431v1","updated":"2024-01-21T08:43:24Z","published":"2024-01-21T08:43:24Z","title":"Majority or Minority: Data Imbalance Learning Method for Named Entity\n Recognition","summary":" Data imbalance presents a significant challenge in various machine learning\n(ML) tasks, particularly named entity recognition (NER) within natural language\nprocessing (NLP). NER exhibits a data imbalance with a long-tail distribution,\nfeaturing numerous minority classes (i.e., entity classes) and a single\nmajority class (i.e., O-class). The imbalance leads to the misclassifications\nof the entity classes as the O-class. To tackle the imbalance, we propose a\nsimple and effective learning method, named majority or minority (MoM)\nlearning. MoM learning incorporates the loss computed only for samples whose\nground truth is the majority class (i.e., the O-class) into the loss of the\nconventional ML model. Evaluation experiments on four NER datasets (Japanese\nand English) showed that MoM learning improves prediction performance of the\nminority classes, without sacrificing the performance of the majority class and\nis more effective than widely known and state-of-the-art methods. We also\nevaluated MoM learning using frameworks as sequential labeling and machine\nreading comprehension, which are commonly used in NER. 
Furthermore, MoM\nlearning has achieved consistent performance improvements regardless of\nlanguage, model, or framework.\n","authors":["Sota Nemoto","Shunsuke Kitada","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2401.11431v1.pdf","comment":"6 pages, 1 figures, 6 tables"},{"id":"http://arxiv.org/abs/2302.06419v2","updated":"2024-01-21T07:41:02Z","published":"2023-02-10T02:55:52Z","title":"AV-data2vec: Self-supervised Learning of Audio-Visual Speech\n Representations with Contextualized Target Representations","summary":" Self-supervision has shown great potential for audio-visual speech\nrecognition by vastly reducing the amount of labeled data required to build\ngood systems. However, existing methods are either not entirely end-to-end or\ndo not train joint representations of both modalities. In this paper, we\nintroduce AV-data2vec which addresses these challenges and builds audio-visual\nrepresentations based on predicting contextualized representations which has\nbeen successful in the uni-modal case. The model uses a shared transformer\nencoder for both audio and video and can combine both modalities to improve\nspeech recognition. Results on LRS3 show that AV-data2vec consistently\noutperforms existing methods under all settings with the same amount of data\nand model size.\n","authors":["Jiachen Lian","Alexei Baevski","Wei-Ning Hsu","Michael Auli"],"pdf_url":"https://arxiv.org/pdf/2302.06419v2.pdf","comment":"2023 ASRU"},{"id":"http://arxiv.org/abs/2401.10015v2","updated":"2024-01-21T06:51:25Z","published":"2024-01-18T14:33:01Z","title":"Towards Hierarchical Spoken Language Dysfluency Modeling","summary":" Speech disfluency modeling is the bottleneck for both speech therapy and\nlanguage learning. However, there is no effective AI solution to systematically\ntackle this problem. We solidify the concept of disfluent speech and disfluent\nspeech modeling. We then present Hierarchical Unconstrained Disfluency Modeling\n(H-UDM) approach, the hierarchical extension of UDM that addresses both\ndisfluency transcription and detection to eliminate the need for extensive\nmanual annotation. Our experimental findings serve as clear evidence of the\neffectiveness and reliability of the methods we have introduced, encompassing\nboth transcription and detection tasks.\n","authors":["Jiachen Lian","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2401.10015v2.pdf","comment":"2024 EACL. Hierarchical extension of our previous workshop paper\n arXiv:2312.12810"},{"id":"http://arxiv.org/abs/2401.11408v1","updated":"2024-01-21T06:10:03Z","published":"2024-01-21T06:10:03Z","title":"SEBERTNets: Sequence Enhanced BERT Networks for Event Entity Extraction\n Tasks Oriented to the Finance Field","summary":" Event extraction lies at the cores of investment analysis and asset\nmanagement in the financial field, and thus has received much attention. The\n2019 China conference on knowledge graph and semantic computing (CCKS)\nchallenge sets up a evaluation competition for event entity extraction task\noriented to the finance field. In this task, we mainly focus on how to extract\nthe event entity accurately, and recall all the corresponding event entity\neffectively. In this paper, we propose a novel model, Sequence Enhanced BERT\nNetworks (SEBERTNets for short), which can inherit the advantages of the\nBERT,and while capturing sequence semantic information. 
In addition, motivated\nby recommendation system, we propose Hybrid Sequence Enhanced BERT Networks\n(HSEBERTNets for short), which uses a multi-channel recall method to recall all\nthe corresponding event entity. The experimental results show that, the F1\nscore of SEBERTNets is 0.905 in the first stage, and the F1 score of\nHSEBERTNets is 0.934 in the first stage, which demonstarate the effectiveness\nof our methods.\n","authors":["Congqing He","Xiangyu Zhu","Yuquan Le","Yuzhong Liu","Jianhong Yin"],"pdf_url":"https://arxiv.org/pdf/2401.11408v1.pdf","comment":"CCKS 2019"},{"id":"http://arxiv.org/abs/2312.07930v2","updated":"2024-01-21T05:22:22Z","published":"2023-12-13T06:57:00Z","title":"Towards Optimal Statistical Watermarking","summary":" We study statistical watermarking by formulating it as a hypothesis testing\nproblem, a general framework which subsumes all previous statistical\nwatermarking methods. Key to our formulation is a coupling of the output tokens\nand the rejection region, realized by pseudo-random generators in practice,\nthat allows non-trivial trade-off between the Type I error and Type II error.\nWe characterize the Uniformly Most Powerful (UMP) watermark in the general\nhypothesis testing setting and the minimax Type II error in the model-agnostic\nsetting. In the common scenario where the output is a sequence of $n$ tokens,\nwe establish nearly matching upper and lower bounds on the number of i.i.d.\ntokens required to guarantee small Type I and Type II errors. Our rate of\n$\\Theta(h^{-1} \\log (1/h))$ with respect to the average entropy per token $h$\nhighlights potentials for improvement from the rate of $h^{-2}$ in the previous\nworks. Moreover, we formulate the robust watermarking problem where users are\nallowed to perform a class of perturbations on the generated texts, and\ncharacterize the optimal type II error of robust UMP tests via a linear\nprogramming problem. To the best of our knowledge, this is the first systematic\nstatistical treatment on the watermarking problem with near-optimal rates in\nthe i.i.d. setting, which might be of interest for future works.\n","authors":["Baihe Huang","Banghua Zhu","Hanlin Zhu","Jason D. Lee","Jiantao Jiao","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2312.07930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11403v1","updated":"2024-01-21T04:54:45Z","published":"2024-01-21T04:54:45Z","title":"MolTailor: Tailoring Chemical Molecular Representation to Specific Tasks\n via Text Prompts","summary":" Deep learning is now widely used in drug discovery, providing significant\nacceleration and cost reduction. As the most fundamental building block,\nmolecular representation is essential for predicting molecular properties to\nenable various downstream applications. Most existing methods attempt to\nincorporate more information to learn better representations. However, not all\nfeatures are equally important for a specific task. Ignoring this would\npotentially compromise the training efficiency and predictive accuracy. To\naddress this issue, we propose a novel approach, which treats language models\nas an agent and molecular pretraining models as a knowledge base. The agent\naccentuates task-relevant features in the molecular representation by\nunderstanding the natural language description of the task, just as a tailor\ncustomizes clothes for clients. 
Thus, we call this approach MolTailor.\nEvaluations demonstrate MolTailor's superior performance over baselines,\nvalidating the efficacy of enhancing relevance for molecular representation\nlearning. This illustrates the potential of language model guided optimization\nto better exploit and unleash the capabilities of existing powerful molecular\nrepresentation methods. Our codes and appendix are available at\nhttps://github.com/SCIR-HI/MolTailor.\n","authors":["Haoqiang Guo","Sendong Zhao","Haochun Wang","Yanrui Du","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2401.11403v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2310.02255v3","updated":"2024-01-21T03:47:06Z","published":"2023-10-03T17:57:24Z","title":"MathVista: Evaluating Mathematical Reasoning of Foundation Models in\n Visual Contexts","summary":" Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit\nimpressive problem-solving skills in many tasks and domains, but their ability\nin mathematical reasoning in visual contexts has not been systematically\nstudied. To bridge this gap, we present MathVista, a benchmark designed to\ncombine challenges from diverse mathematical and visual tasks. It consists of\n6,141 examples, derived from 28 existing multimodal datasets involving\nmathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and\nPaperQA). Completing these tasks requires fine-grained, deep visual\nunderstanding and compositional reasoning, which all state-of-the-art\nfoundation models find challenging. With MathVista, we have conducted a\ncomprehensive, quantitative evaluation of 12 prominent foundation models. The\nbest-performing GPT-4V model achieves an overall accuracy of 49.9%,\nsubstantially outperforming Bard, the second-best performer, by 15.1%. Our\nin-depth analysis reveals that the superiority of GPT-4V is mainly attributed\nto its enhanced visual perception and mathematical reasoning. However, GPT-4V\nstill falls short of human performance by 10.4%, as it often struggles to\nunderstand complex figures and perform rigorous reasoning. This significant gap\nunderscores the critical role that MathVista will play in the development of\ngeneral-purpose AI agents capable of tackling mathematically intensive and\nvisually rich real-world tasks. We further explore the new ability of\nself-verification, the application of self-consistency, and the interactive\nchatbot capabilities of GPT-4V, highlighting its promising potential for future\nresearch. The project is available at https://mathvista.github.io/.\n","authors":["Pan Lu","Hritik Bansal","Tony Xia","Jiacheng Liu","Chunyuan Li","Hannaneh Hajishirzi","Hao Cheng","Kai-Wei Chang","Michel Galley","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.02255v3.pdf","comment":"116 pages, 120 figures. Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11389v1","updated":"2024-01-21T03:37:47Z","published":"2024-01-21T03:37:47Z","title":"MedLM: Exploring Language Models for Medical Question Answering Systems","summary":" In the face of rapidly expanding online medical literature, automated systems\nfor aggregating and summarizing information are becoming increasingly crucial\nfor healthcare professionals and patients. Large Language Models (LLMs), with\ntheir advanced generative capabilities, have shown promise in various NLP\ntasks, and their potential in the healthcare domain, particularly for\nClosed-Book Generative QnA, is significant. 
However, the performance of these\nmodels in domain-specific tasks such as medical Q&A remains largely unexplored.\nThis study aims to fill this gap by comparing the performance of general and\nmedical-specific distilled LMs for medical Q&A. We aim to evaluate the\neffectiveness of fine-tuning domain-specific LMs and compare the performance of\ndifferent families of Language Models. The study will address critical\nquestions about these models' reliability, comparative performance, and\neffectiveness in the context of medical Q&A. The findings will provide valuable\ninsights into the suitability of different LMs for specific applications in the\nmedical domain.\n","authors":["Niraj Yagnik","Jay Jhaveri","Vivek Sharma","Gabriel Pila","Asma Ben","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2401.11389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10189v2","updated":"2024-01-21T03:37:41Z","published":"2024-01-18T18:20:15Z","title":"Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through\n Text Reconstruction","summary":" Fine-grained few-shot entity extraction in the chemical domain faces two\nunique challenges. First, compared with entity extraction tasks in the general\ndomain, sentences from chemical papers usually contain more entities. Moreover,\nentity extraction models usually have difficulty extracting entities of\nlong-tailed types. In this paper, we propose Chem-FINESE, a novel\nsequence-to-sequence (seq2seq) based few-shot entity extraction approach, to\naddress these two challenges. Our Chem-FINESE has two components: a seq2seq\nentity extractor to extract named entities from the input sentence and a\nseq2seq self-validation module to reconstruct the original input sentence from\nextracted entities. Inspired by the fact that a good entity extraction system\nneeds to extract entities faithfully, our new self-validation module leverages\nentity extraction results to reconstruct the original input sentence. Besides,\nwe design a new contrastive loss to reduce excessive copying during the\nextraction process. Finally, we release ChemNER+, a new fine-grained chemical\nentity extraction dataset that is annotated by domain experts with the ChemNER\nschema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets\nshow that our newly proposed framework has contributed up to 8.26% and 6.84%\nabsolute F1-score gains respectively.\n","authors":["Qingyun Wang","Zixuan Zhang","Hongxiang Li","Xuan Liu","Jiawei Han","Heng Ji","Huimin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10189v2.pdf","comment":"16 pages. Accepted by Findings of the Association for Computational\n Linguistics: EACL 2024. Code and resources are available at\n https://github.com/EagleW/Chem-FINESE"},{"id":"http://arxiv.org/abs/2401.11382v1","updated":"2024-01-21T03:15:05Z","published":"2024-01-21T03:15:05Z","title":"Using Large Language Model for End-to-End Chinese ASR and NER","summary":" Mapping speech tokens to the same feature space as text tokens has become the\nparadigm for the integration of speech modality into decoder-only large\nlanguage models (LLMs). An alternative approach is to use an encoder-decoder\narchitecture that incorporates speech features through cross-attention. This\napproach, however, has received less attention in the literature. In this work,\nwe connect the Whisper encoder with ChatGLM3 and provide in-depth comparisons\nof these two approaches using Chinese automatic speech recognition (ASR) and\nname entity recognition (NER) tasks. 
We evaluate them not only by conventional\nmetrics like the F1 score but also by a novel fine-grained taxonomy of ASR-NER\nerrors. Our experiments reveal that encoder-decoder architecture outperforms\ndecoder-only architecture with a short context, while decoder-only architecture\nbenefits from a long context as it fully exploits all layers of the LLM. By\nusing LLM, we significantly reduced the entity omission errors and improved the\nentity ASR accuracy compared to the Conformer baseline. Additionally, we\nobtained a state-of-the-art (SOTA) F1 score of 0.805 on the AISHELL-NER test\nset by using chain-of-thought (CoT) NER which first infers long-form ASR\ntranscriptions and then predicts NER labels.\n","authors":["Yuang Li","Jiawei Yu","Yanqing Zhao","Min Zhang","Mengxin Ren","Xiaofeng Zhao","Xiaosong Qiao","Chang Su","Miaomiao Ma","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11382v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.11374v1","updated":"2024-01-21T02:29:12Z","published":"2024-01-21T02:29:12Z","title":"Language Models as Hierarchy Encoders","summary":" Interpreting hierarchical structures latent in language is a key limitation\nof current language models (LMs). While previous research has implicitly\nleveraged these hierarchies to enhance LMs, approaches for their explicit\nencoding are yet to be explored. To address this, we introduce a novel approach\nto re-train transformer encoder-based LMs as Hierarchy Transformer encoders\n(HiTs), harnessing the expansive nature of hyperbolic space. Our method\nsituates the output embedding space of pre-trained LMs within a Poincar\\'e ball\nwith a curvature that adapts to the embedding dimension, followed by\nre-training on hyperbolic cluster and centripetal losses. These losses are\ndesigned to effectively cluster related entities (input as texts) and organise\nthem hierarchically. We evaluate HiTs against pre-trained and fine-tuned LMs,\nfocusing on their capabilities in simulating transitive inference, predicting\nsubsumptions, and transferring knowledge across hierarchies. The results\ndemonstrate that HiTs consistently outperform both pre-trained and fine-tuned\nLMs in these tasks, underscoring the effectiveness and transferability of our\nre-trained hierarchy encoders.\n","authors":["Yuan He","Zhangdie Yuan","Jiaoyan Chen","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2401.11374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11373v1","updated":"2024-01-21T02:25:29Z","published":"2024-01-21T02:25:29Z","title":"Finding a Needle in the Adversarial Haystack: A Targeted Paraphrasing\n Approach For Uncovering Edge Cases with Minimal Distribution Distortion","summary":" Adversarial attacks against NLP Deep Learning models are a significant\nconcern. In particular, adversarial samples exploit the model's sensitivity to\nsmall input changes. While these changes appear insignificant on the semantics\nof the input sample, they result in significant decay in model performance. In\nthis paper, we propose Targeted Paraphrasing via RL (TPRL), an approach to\nautomatically learn a policy to generate challenging samples that most likely\nimprove the model's performance. TPRL leverages FLAN T5, a language model, as a\ngenerator and employs a self learned policy using a proximal policy gradient to\ngenerate the adversarial examples automatically. TPRL's reward is based on the\nconfusion induced in the classifier, preserving the original text meaning\nthrough a Mutual Implication score. 
We demonstrate and evaluate TPRL's\neffectiveness in discovering natural adversarial attacks and improving model\nperformance through extensive experiments on four diverse NLP classification\ntasks via Automatic and Human evaluation. TPRL outperforms strong baselines,\nexhibits generalizability across classifiers and datasets, and combines the\nstrengths of language modeling and reinforcement learning to generate diverse\nand influential adversarial examples.\n","authors":["Aly M. Kassem","Sherif Saad"],"pdf_url":"https://arxiv.org/pdf/2401.11373v1.pdf","comment":"EACL 2024 - Main conference"},{"id":"http://arxiv.org/abs/2401.11365v1","updated":"2024-01-21T01:37:25Z","published":"2024-01-21T01:37:25Z","title":"Confidence Preservation Property in Knowledge Distillation Abstractions","summary":" Social media platforms prevent malicious activities by detecting harmful\ncontent of posts and comments. To that end, they employ large-scale deep neural\nnetwork language models for sentiment analysis and content understanding. Some\nmodels, like BERT, are complex, and have numerous parameters, which makes them\nexpensive to operate and maintain. To overcome these deficiencies, industry\nexperts employ a knowledge distillation compression technique, where a\ndistilled model is trained to reproduce the classification behavior of the\noriginal model. The distillation processes terminates when the distillation\nloss function reaches the stopping criteria. This function is mainly designed\nto ensure that the original and the distilled models exhibit alike\nclassification behaviors. However, besides classification accuracy, there are\nadditional properties of the original model that the distilled model should\npreserve to be considered as an appropriate abstraction. In this work, we\nexplore whether distilled TinyBERT models preserve confidence values of the\noriginal BERT models, and investigate how this confidence preservation property\ncould guide tuning hyperparameters of the distillation process.\n","authors":["Dmitry Vengertsev","Elena Sherman"],"pdf_url":"https://arxiv.org/pdf/2401.11365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11361v1","updated":"2024-01-21T01:18:08Z","published":"2024-01-21T01:18:08Z","title":"Revolutionizing API Documentation through Summarization","summary":" This study tackles the challenges associated with interpreting Application\nProgramming Interface (API) documentation, an integral aspect of software\ndevelopment. Official API documentation, while essential, can be lengthy and\nchallenging to navigate, prompting developers to seek unofficial sources such\nas Stack Overflow. Leveraging the vast user-generated content on Stack\nOverflow, including code snippets and discussions, we employ BERTopic and\nextractive summarization to automatically generate concise and informative API\nsummaries. These summaries encompass key insights like general usage, common\ndeveloper issues, and potential solutions, sourced from the wealth of knowledge\non Stack Overflow. 
Software developers evaluate these summaries for\nperformance, coherence, and interoperability, providing valuable feedback on\nthe practicality of our approach.\n","authors":["AmirHossein Naghshzan","Sylvie Ratte"],"pdf_url":"https://arxiv.org/pdf/2401.11361v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.09070"},{"id":"http://arxiv.org/abs/2401.11356v1","updated":"2024-01-21T00:58:31Z","published":"2024-01-21T00:58:31Z","title":"ProLex: A Benchmark for Language Proficiency-oriented Lexical\n Substitution","summary":" Lexical Substitution discovers appropriate substitutes for a given target\nword in a context sentence. However, the task fails to consider substitutes\nthat are of equal or higher proficiency than the target, an aspect that could\nbe beneficial for language learners looking to improve their writing. To bridge\nthis gap, we propose a new task, language proficiency-oriented lexical\nsubstitution. We also introduce ProLex, a novel benchmark designed to assess\nsystems' ability to generate not only appropriate substitutes but also\nsubstitutes that demonstrate better language proficiency. Besides the\nbenchmark, we propose models that can automatically perform the new task. We\nshow that our best model, a Llama2-13B model fine-tuned with task-specific\nsynthetic data, outperforms ChatGPT by an average of 3.2% in F-score and\nachieves comparable results with GPT-4 on ProLex.\n","authors":["Xuanming Zhang","Zixun Chen","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11356v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05105v2","updated":"2024-01-21T23:04:32Z","published":"2023-03-09T08:24:02Z","title":"MaskDiff: Modeling Mask Distribution with Diffusion Probabilistic Model\n for Few-Shot Instance Segmentation","summary":" Few-shot instance segmentation extends the few-shot learning paradigm to the\ninstance segmentation task, which tries to segment instance objects from a\nquery image with a few annotated examples of novel categories. 
Conventional\napproaches have attempted to address the task via prototype learning, known as\npoint estimation. However, this mechanism depends on prototypes (\\eg mean of\n$K-$shot) for prediction, leading to performance instability. To overcome the\ndisadvantage of the point estimation mechanism, we propose a novel approach,\ndubbed MaskDiff, which models the underlying conditional distribution of a\nbinary mask, which is conditioned on an object region and $K-$shot information.\nInspired by augmentation approaches that perturb data with Gaussian noise for\npopulating low data density regions, we model the mask distribution with a\ndiffusion probabilistic model. We also propose to utilize classifier-free\nguided mask sampling to integrate category information into the binary mask\ngeneration process. Without bells and whistles, our proposed method\nconsistently outperforms state-of-the-art methods on both base and novel\nclasses of the COCO dataset while simultaneously being more stable than\nexisting methods. The source code is available at:\nhttps://github.com/minhquanlecs/MaskDiff.\n","authors":["Minh-Quan Le","Tam V. Nguyen","Trung-Nghia Le","Thanh-Toan Do","Minh N. Do","Minh-Triet Tran"],"pdf_url":"https://arxiv.org/pdf/2303.05105v2.pdf","comment":"Accepted at AAAI 2024 (oral presentation)"},{"id":"http://arxiv.org/abs/2401.11617v1","updated":"2024-01-21T22:50:44Z","published":"2024-01-21T22:50:44Z","title":"A Survey on African Computer Vision Datasets, Topics and Researchers","summary":" Computer vision encompasses a range of tasks such as object detection,\nsemantic segmentation, and 3D reconstruction. Despite its relevance to African\ncommunities, research in this field within Africa represents only 0.06% of\ntop-tier publications over the past decade. This study undertakes a thorough\nanalysis of 63,000 Scopus-indexed computer vision publications from Africa,\nspanning from 2012 to 2022. The aim is to provide a survey of African computer\nvision topics, datasets and researchers. A key aspect of our study is the\nidentification and categorization of African Computer Vision datasets using\nlarge language models that automatically parse abstracts of these publications.\nWe also provide a compilation of unofficial African Computer Vision datasets\ndistributed through challenges or data hosting platforms, and provide a full\ntaxonomy of dataset categories. Our survey also pinpoints computer vision\ntopics trends specific to different African regions, indicating their unique\nfocus areas. Additionally, we carried out an extensive survey to capture the\nviews of African researchers on the current state of computer vision research\nin the continent and the structural barriers they believe need urgent\nattention. In conclusion, this study catalogs and categorizes Computer Vision\ndatasets and topics contributed or initiated by African institutions and\nidentifies barriers to publishing in top-tier Computer Vision venues. This\nsurvey underscores the importance of encouraging African researchers and\ninstitutions in advancing computer vision research in the continent. It also\nstresses on the need for research topics to be more aligned with the needs of\nAfrican communities.\n","authors":["Abdul-Hakeem Omotayo","Ashery Mbilinyi","Lukman Ismaila","Houcemeddine Turki","Mahmoud Abdien","Karim Gamal","Idriss Tondji","Yvan Pimi","Naome A. Etori","Marwa M. 
Matar","Clifford Broni-Bediako","Abigail Oppong","Mai Gamal","Eman Ehab","Gbetondji Dovonon","Zainab Akinjobi","Daniel Ajisafe","Oluwabukola G. Adegboro","Mennatullah Siam"],"pdf_url":"https://arxiv.org/pdf/2401.11617v1.pdf","comment":"Under Review, Community Work of Ro'ya Grassroots,\n https://ro-ya-cv4africa.github.io/homepage/. arXiv admin note: text overlap\n with arXiv:2305.06773"},{"id":"http://arxiv.org/abs/2311.03500v2","updated":"2024-01-21T22:04:28Z","published":"2023-11-06T20:18:26Z","title":"Predicting Age from White Matter Diffusivity with Residual Learning","summary":" Imaging findings inconsistent with those expected at specific chronological\nage ranges may serve as early indicators of neurological disorders and\nincreased mortality risk. Estimation of chronological age, and deviations from\nexpected results, from structural MRI data has become an important task for\ndeveloping biomarkers that are sensitive to such deviations. Complementary to\nstructural analysis, diffusion tensor imaging (DTI) has proven effective in\nidentifying age-related microstructural changes within the brain white matter,\nthereby presenting itself as a promising additional modality for brain age\nprediction. Although early studies have sought to harness DTI's advantages for\nage estimation, there is no evidence that the success of this prediction is\nowed to the unique microstructural and diffusivity features that DTI provides,\nrather than the macrostructural features that are also available in DTI data.\nTherefore, we seek to develop white-matter-specific age estimation to capture\ndeviations from normal white matter aging. Specifically, we deliberately\ndisregard the macrostructural information when predicting age from DTI scalar\nimages, using two distinct methods. The first method relies on extracting only\nmicrostructural features from regions of interest. The second applies 3D\nresidual neural networks (ResNets) to learn features directly from the images,\nwhich are non-linearly registered and warped to a template to minimize\nmacrostructural variations. When tested on unseen data, the first method yields\nmean absolute error (MAE) of 6.11 years for cognitively normal participants and\nMAE of 6.62 years for cognitively impaired participants, while the second\nmethod achieves MAE of 4.69 years for cognitively normal participants and MAE\nof 4.96 years for cognitively impaired participants. We find that the ResNet\nmodel captures subtler, non-macrostructural features for brain age prediction.\n","authors":["Chenyu Gao","Michael E. Kim","Ho Hin Lee","Qi Yang","Nazirah Mohd Khairi","Praitayini Kanakaraj","Nancy R. Newlin","Derek B. Archer","Angela L. Jefferson","Warren D. Taylor","Brian D. Boyd","Lori L. Beason-Held","Susan M. Resnick","The BIOCARD Study Team","Yuankai Huo","Katherine D. Van Schaik","Kurt G. Schilling","Daniel Moyer","Ivana Išgum","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2311.03500v2.pdf","comment":"SPIE Medical Imaging: Image Processing. San Diego, CA. February 2024\n (accepted as poster presentation)"},{"id":"http://arxiv.org/abs/2401.11605v1","updated":"2024-01-21T21:49:49Z","published":"2024-01-21T21:49:49Z","title":"Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass\n Diffusion Transformers","summary":" We present the Hourglass Diffusion Transformer (HDiT), an image generative\nmodel that exhibits linear scaling with pixel count, supporting training at\nhigh-resolution (e.g. $1024 \\times 1024$) directly in pixel-space. 
Building on\nthe Transformer architecture, which is known to scale to billions of\nparameters, it bridges the gap between the efficiency of convolutional U-Nets\nand the scalability of Transformers. HDiT trains successfully without typical\nhigh-resolution training techniques such as multiscale architectures, latent\nautoencoders or self-conditioning. We demonstrate that HDiT performs\ncompetitively with existing models on ImageNet $256^2$, and sets a new\nstate-of-the-art for diffusion models on FFHQ-$1024^2$.\n","authors":["Katherine Crowson","Stefan Andreas Baumann","Alex Birch","Tanishq Mathew Abraham","Daniel Z. Kaplan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2401.11605v1.pdf","comment":"20 pages, 13 figures, project page and code available at\n https://crowsonkb.github.io/hourglass-diffusion-transformers/"},{"id":"http://arxiv.org/abs/2401.11598v1","updated":"2024-01-21T21:04:05Z","published":"2024-01-21T21:04:05Z","title":"TetraLoss: Improving the Robustness of Face Recognition against Morphing\n Attacks","summary":" Face recognition systems are widely deployed in high-security applications\nsuch as for biometric verification at border controls. Despite their high\naccuracy on pristine data, it is well-known that digital manipulations, such as\nface morphing, pose a security threat to face recognition systems. Malicious\nactors can exploit the facilities offered by the identity document issuance\nprocess to obtain identity documents containing morphed images. Thus, subjects\nwho contributed to the creation of the morphed image can with high probability\nuse the identity document to bypass automated face recognition systems. In\nrecent years, no-reference (i.e., single image) and differential morphing\nattack detectors have been proposed to tackle this risk. These systems are\ntypically evaluated in isolation from the face recognition system that they\nhave to operate jointly with and do not consider the face recognition process.\nContrary to most existing works, we present a novel method for adapting deep\nlearning-based face recognition systems to be more robust against face morphing\nattacks. To this end, we introduce TetraLoss, a novel loss function that learns\nto separate morphed face images from its contributing subjects in the embedding\nspace while still preserving high biometric verification performance. In a\ncomprehensive evaluation, we show that the proposed method can significantly\nenhance the original system while also significantly outperforming other tested\nbaseline methods.\n","authors":["Mathias Ibsen","Lázaro J. González-Soler","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2401.11598v1.pdf","comment":"Accepted to the IEEE International Conference on Automatic Face &\n Gesture Recognition 2024 (FG'24)"},{"id":"http://arxiv.org/abs/2310.01361v2","updated":"2024-01-21T21:01:12Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. 
In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v2.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo and\n datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for more details"},{"id":"http://arxiv.org/abs/2401.11582v1","updated":"2024-01-21T20:10:02Z","published":"2024-01-21T20:10:02Z","title":"Thermal Image Calibration and Correction using Unpaired Cycle-Consistent\n Adversarial Networks","summary":" Unmanned aerial vehicles (UAVs) offer a flexible and cost-effective solution\nfor wildfire monitoring. However, their widespread deployment during wildfires\nhas been hindered by a lack of operational guidelines and concerns about\npotential interference with aircraft systems. Consequently, the progress in\ndeveloping deep-learning models for wildfire detection and characterization\nusing aerial images is constrained by the limited availability, size, and\nquality of existing datasets. This paper introduces a solution aimed at\nenhancing the quality of current aerial wildfire datasets to align with\nadvancements in camera technology. The proposed approach offers a solution to\ncreate a comprehensive, standardized large-scale image dataset. 
This paper\npresents a pipeline based on CycleGAN to enhance wildfire datasets and a novel\nfusion method that integrates paired RGB images as attribute conditioning in\nthe generators of both directions, improving the accuracy of the generated\nimages.\n","authors":["Hossein Rajoli","Pouya Afshin","Fatemeh Afghah"],"pdf_url":"https://arxiv.org/pdf/2401.11582v1.pdf","comment":"This paper has been accepted at the Asilomar 2023 Conference and will\n be published"},{"id":"http://arxiv.org/abs/2303.05123v3","updated":"2024-01-21T18:11:49Z","published":"2023-03-09T09:12:21Z","title":"Dominating Set Database Selection for Visual Place Recognition","summary":" This paper presents an approach for creating a visual place recognition (VPR)\ndatabase for localization in indoor environments from RGBD scanning sequences.\nThe proposed approach is formulated as a minimization problem in terms of\ndominating set algorithm for graph, constructed from spatial information, and\nreferred as DominatingSet. Our algorithm shows better scene coverage in\ncomparison to other methodologies that are used for database creation. Also, we\ndemonstrate that using DominatingSet, a database size could be up to 250-1400\ntimes smaller than the original scanning sequence while maintaining a recall\nrate of more than 80% on testing sequences. We evaluated our algorithm on\n7-scenes and BundleFusion datasets and an additionally recorded sequence in a\nhighly repetitive office setting. In addition, the database selection can\nproduce weakly-supervised labels for fine-tuning neural place recognition\nalgorithms to particular settings, improving even more their accuracy. The\npaper also presents a fully automated pipeline for VPR database creation from\nRGBD scanning sequences, as well as a set of metrics for VPR database\nevaluation. The code and released data are available on our web-page~ --\nhttps://prime-slam.github.io/place-recognition-db/\n","authors":["Anastasiia Kornilova","Ivan Moskalenko","Timofei Pushkin","Fakhriddin Tojiboev","Rahim Tariverdizadeh","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2303.05123v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11544v1","updated":"2024-01-21T16:59:44Z","published":"2024-01-21T16:59:44Z","title":"Hierarchical Prompts for Rehearsal-free Continual Learning","summary":" Continual learning endeavors to equip the model with the capability to\nintegrate current task knowledge while mitigating the forgetting of past task\nknowledge. Inspired by prompt tuning, prompt-based methods maintain a frozen\nbackbone and train with slight learnable prompts to minimize the catastrophic\nforgetting that arises due to updating a large number of backbone parameters.\nNonetheless, these learnable prompts tend to concentrate on the discriminatory\nknowledge of the current task while ignoring past task knowledge, leading to\nthat learnable prompts still suffering from catastrophic forgetting. This paper\nintroduces a novel rehearsal-free paradigm for continual learning termed\nHierarchical Prompts (H-Prompts), comprising three categories of prompts --\nclass prompt, task prompt, and general prompt. To effectively depict the\nknowledge of past classes, class prompt leverages Bayesian Distribution\nAlignment to model the distribution of classes in each task. To reduce the\nforgetting of past task knowledge, task prompt employs Cross-task Knowledge\nExcavation to amalgamate the knowledge encapsulated in the learned class\nprompts of past tasks and current task knowledge. 
Furthermore, general prompt\nutilizes Generalized Knowledge Exploration to deduce highly generalized\nknowledge in a self-supervised manner. Evaluations on two benchmarks\nsubstantiate the efficacy of the proposed H-Prompts, exemplified by an average\naccuracy of 87.8% in Split CIFAR-100 and 70.6% in Split ImageNet-R.\n","authors":["Yukun Zuo","Hantao Yao","Lu Yu","Liansheng Zhuang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11544v1.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2401.11543v1","updated":"2024-01-21T16:55:40Z","published":"2024-01-21T16:55:40Z","title":"How Robust Are Energy-Based Models Trained With Equilibrium Propagation?","summary":" Deep neural networks (DNNs) are easily fooled by adversarial perturbations\nthat are imperceptible to humans. Adversarial training, a process where\nadversarial examples are added to the training set, is the current\nstate-of-the-art defense against adversarial attacks, but it lowers the model's\naccuracy on clean inputs, is computationally expensive, and offers less\nrobustness to natural noise. In contrast, energy-based models (EBMs), which\nwere designed for efficient implementation in neuromorphic hardware and\nphysical systems, incorporate feedback connections from each layer to the\nprevious layer, yielding a recurrent, deep-attractor architecture which we\nhypothesize should make them naturally robust. Our work is the first to explore\nthe robustness of EBMs to both natural corruptions and adversarial attacks,\nwhich we do using the CIFAR-10 and CIFAR-100 datasets. We demonstrate that EBMs\nare more robust than transformers and display comparable robustness to\nadversarially-trained DNNs on gradient-based (white-box) attacks, query-based\n(black-box) attacks, and natural perturbations without sacrificing clean\naccuracy, and without the need for adversarial training or additional training\ntechniques.\n","authors":["Siddharth Mansingh","Michal Kucer","Garrett Kenyon","Juston Moore","Michael Teti"],"pdf_url":"https://arxiv.org/pdf/2401.11543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11541v1","updated":"2024-01-21T16:46:04Z","published":"2024-01-21T16:46:04Z","title":"Multi-View Neural 3D Reconstruction of Micro-/Nanostructures with Atomic\n Force Microscopy","summary":" Atomic Force Microscopy (AFM) is a widely employed tool for micro-/nanoscale\ntopographic imaging. However, conventional AFM scanning struggles to\nreconstruct complex 3D micro-/nanostructures precisely due to limitations such\nas incomplete sample topography capturing and tip-sample convolution artifacts.\nHere, we propose a multi-view neural-network-based framework with AFM\n(MVN-AFM), which accurately reconstructs surface models of intricate\nmicro-/nanostructures. Unlike previous works, MVN-AFM does not depend on any\nspecially shaped probes or costly modifications to the AFM system. To achieve\nthis, MVN-AFM uniquely employs an iterative method to align multi-view data and\neliminate AFM artifacts simultaneously. Furthermore, we pioneer the application\nof neural implicit surface reconstruction in nanotechnology and achieve\nmarkedly improved results. Extensive experiments show that MVN-AFM effectively\neliminates artifacts present in raw AFM images and reconstructs various\nmicro-/nanostructures including complex geometrical microstructures printed via\nTwo-photon Lithography and nanoparticles such as PMMA nanospheres and ZIF-67\nnanocrystals. 
This work presents a cost-effective tool for micro-/nanoscale 3D\nanalysis.\n","authors":["Shuo Chen","Mao Peng","Yijin Li","Bing-Feng Ju","Hujun Bao","Yuan-Liu Chen","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16301v2","updated":"2024-01-21T16:27:06Z","published":"2023-09-28T09:54:10Z","title":"Gated Cross-Attention Network for Depth Completion","summary":" Depth completion is a popular research direction in the field of depth\nestimation. The fusion of color and depth features is the current critical\nchallenge in this task, mainly due to the asymmetry between the rich scene\ndetails in color images and the sparse pixels in depth maps. To tackle this\nissue, we design an efficient Gated Cross-Attention Network that propagates\nconfidence via a gating mechanism, simultaneously extracting and refining key\ninformation in both color and depth branches to achieve local spatial feature\nfusion. Additionally, we employ an attention network based on the Transformer\nin low-dimensional space to effectively fuse global features and increase the\nnetwork's receptive field. With a simple yet efficient gating mechanism, our\nproposed method achieves fast and accurate depth completion without the need\nfor additional branches or post-processing steps. At the same time, we use the\nRay Tune mechanism with the AsyncHyperBandScheduler scheduler and the\nHyperOptSearch algorithm to automatically search for the optimal number of\nmodule iterations, which also allows us to achieve performance comparable to\nstate-of-the-art methods. We conduct experiments on both indoor and outdoor\nscene datasets. Our fast network achieves Pareto-optimal solutions in terms of\ntime and accuracy, and at the time of submission, our accurate network ranks\nfirst among all published papers on the KITTI official website in terms of\naccuracy.\n","authors":["Xiaogang Jia","Songlei Jian","Yusong Tan","Yonggang Che","Wei Chen","Zhengfa Liang"],"pdf_url":"https://arxiv.org/pdf/2309.16301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13472v3","updated":"2024-01-21T16:14:44Z","published":"2023-03-23T17:43:17Z","title":"Promptable Game Models: Text-Guided Game Simulation via Masked Diffusion\n Models","summary":" Neural video game simulators emerged as powerful tools to generate and edit\nvideos. Their idea is to represent games as the evolution of an environment's\nstate driven by the actions of its agents. While such a paradigm enables users\nto play a game action-by-action, its rigidity precludes more semantic forms of\ncontrol. To overcome this limitation, we augment game models with prompts\nspecified as a set of natural language actions and desired states. The result-a\nPromptable Game Model (PGM)-makes it possible for a user to play the game by\nprompting it with high- and low-level action sequences. Most captivatingly, our\nPGM unlocks the director's mode, where the game is played by specifying goals\nfor the agents in the form of a prompt. This requires learning \"game AI\",\nencapsulated by our animation model, to navigate the scene using high-level\nconstraints, play against an adversary, and devise a strategy to win a point.\nTo render the resulting state, we use a compositional NeRF representation\nencapsulated in our synthesis model. To foster future research, we present\nnewly collected, annotated and calibrated Tennis and Minecraft datasets. 
Our\nmethod significantly outperforms existing neural video game simulators in terms\nof rendering quality and unlocks applications beyond the capabilities of the\ncurrent state of the art. Our framework, data, and models are available at\nhttps://snap-research.github.io/promptable-game-models/.\n","authors":["Willi Menapace","Aliaksandr Siarohin","Stéphane Lathuilière","Panos Achlioptas","Vladislav Golyanik","Sergey Tulyakov","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.13472v3.pdf","comment":"ACM Transactions on Graphics \\c{opyright} Copyright is held by the\n owner/author(s) 2023. This is the author's version of the work. It is posted\n here for your personal use. Not for redistribution. The definitive Version of\n Record was published in ACM Transactions on Graphics,\n http://dx.doi.org/10.1145/3635705"},{"id":"http://arxiv.org/abs/2401.11535v1","updated":"2024-01-21T16:14:04Z","published":"2024-01-21T16:14:04Z","title":"Deformable Endoscopic Tissues Reconstruction with Gaussian Splatting","summary":" Surgical 3D reconstruction is a critical area of research in robotic surgery,\nwith recent works adopting variants of dynamic radiance fields to achieve\nsuccess in 3D reconstruction of deformable tissues from single-viewpoint\nvideos. However, these methods often suffer from time-consuming optimization or\ninferior quality, limiting their adoption in downstream tasks. Inspired by 3D\nGaussian Splatting, a recent trending 3D representation, we present EndoGS,\napplying Gaussian Splatting for deformable endoscopic tissue reconstruction.\nSpecifically, our approach incorporates deformation fields to handle dynamic\nscenes, depth-guided supervision to optimize 3D targets with a single\nviewpoint, and a spatial-temporal weight mask to mitigate tool occlusion. As a\nresult, EndoGS reconstructs and renders high-quality deformable endoscopic\ntissues from a single-viewpoint video, estimated depth maps, and labeled tool\nmasks. Experiments on DaVinci robotic surgery videos demonstrate that EndoGS\nachieves superior rendering quality. Code is available at\nhttps://github.com/HKU-MedAI/EndoGS.\n","authors":["Lingting Zhu","Zhao Wang","Zhenchao Jin","Guying Lin","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11535v1.pdf","comment":"Work in progress. 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11519v1","updated":"2024-01-21T15:22:15Z","published":"2024-01-21T15:22:15Z","title":"CaBuAr: California Burned Areas dataset for delineation","summary":" Forest wildfires represent one of the catastrophic events that, over the last\ndecades, caused huge environmental and humanitarian damages. In addition to a\nsignificant amount of carbon dioxide emission, they are a source of risk to\nsociety in both short-term (e.g., temporary city evacuation due to fire) and\nlong-term (e.g., higher risks of landslides) cases. Consequently, the\navailability of tools to support local authorities in automatically identifying\nburned areas plays an important role in the continuous monitoring requirement\nto alleviate the aftereffects of such catastrophic events. The great\navailability of satellite acquisitions coupled with computer vision techniques\nrepresents an important step in developing such tools. This paper introduces a\nnovel open dataset that tackles the burned area delineation problem, a binary\nsegmentation problem applied to satellite imagery. The presented resource\nconsists of pre- and post-fire Sentinel-2 L2A acquisitions of California forest\nfires that took place starting in 2015. 
Raster annotations were generated from\nthe data released by California's Department of Forestry and Fire Protection.\nMoreover, in conjunction with the dataset, we release three different baselines\nbased on spectral indexes analyses, SegFormer, and U-Net models.\n","authors":["Daniele Rege Cambrin","Luca Colomba","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2401.11519v1.pdf","comment":"Accepted at the IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2401.11511v1","updated":"2024-01-21T14:48:38Z","published":"2024-01-21T14:48:38Z","title":"MobileARLoc: On-device Robust Absolute Localisation for Pervasive\n Markerless Mobile AR","summary":" Recent years have seen significant improvement in absolute camera pose\nestimation, paving the way for pervasive markerless Augmented Reality (AR).\nHowever, accurate absolute pose estimation techniques are computation- and\nstorage-heavy, requiring computation offloading. As such, AR systems rely on\nvisual-inertial odometry (VIO) to track the device's relative pose between\nrequests to the server. However, VIO suffers from drift, requiring frequent\nabsolute repositioning. This paper introduces MobileARLoc, a new framework for\non-device large-scale markerless mobile AR that combines an absolute pose\nregressor (APR) with a local VIO tracking system. Absolute pose regressors\n(APRs) provide fast on-device pose estimation at the cost of reduced accuracy.\nTo address APR accuracy and reduce VIO drift, MobileARLoc creates a feedback\nloop where VIO pose estimations refine the APR predictions. The VIO system\nidentifies reliable predictions of APR, which are then used to compensate for\nthe VIO drift. We comprehensively evaluate MobileARLoc through dataset\nsimulations. MobileARLoc halves the error compared to the underlying APR and\nachieve fast (80\\,ms) on-device inference speed.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2401.11511v1.pdf","comment":"Accepted for publication at the 3rd edition of the Pervasive and\n Resource-Constrained AI (PerConAI) workshop (co-located with PerCom 2024).\n arXiv admin note: substantial text overlap with arXiv:2308.05394"},{"id":"http://arxiv.org/abs/2401.11499v1","updated":"2024-01-21T14:09:49Z","published":"2024-01-21T14:09:49Z","title":"Self-Supervised Bird's Eye View Motion Prediction with Cross-Modality\n Signals","summary":" Learning the dense bird's eye view (BEV) motion flow in a self-supervised\nmanner is an emerging research for robotics and autonomous driving. Current\nself-supervised methods mainly rely on point correspondences between point\nclouds, which may introduce the problems of fake flow and inconsistency,\nhindering the model's ability to learn accurate and realistic motion. In this\npaper, we introduce a novel cross-modality self-supervised training framework\nthat effectively addresses these issues by leveraging multi-modality data to\nobtain supervision signals. 
We design three innovative supervision signals to\npreserve the inherent properties of scene motion, including the masked Chamfer\ndistance loss, the piecewise rigidity loss, and the temporal consistency loss.\nThrough extensive experiments, we demonstrate that our proposed self-supervised\nframework outperforms all previous self-supervision methods for the motion\nprediction task.\n","authors":["Shaoheng Fang","Zuhong Liu","Mingyu Wang","Chenxin Xu","Yiqi Zhong","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11492v1","updated":"2024-01-21T13:45:52Z","published":"2024-01-21T13:45:52Z","title":"Edge-Enabled Real-time Railway Track Segmentation","summary":" Accurate and rapid railway track segmentation can assist automatic train\ndriving and is a key step in early warning to fixed or moving obstacles on the\nrailway track. However, certain existing algorithms tailored for track\nsegmentation often struggle to meet the requirements of real-time and\nefficiency on resource-constrained edge devices. Considering this challenge, we\npropose an edge-enabled real-time railway track segmentation algorithm, which\nis optimized to be suitable for edge applications by optimizing the network\nstructure and quantizing the model after training. Initially, Ghost convolution\nis introduced to reduce the complexity of the backbone, thereby achieving the\nextraction of key information of the interested region at a lower cost. To\nfurther reduce the model complexity and calculation, a new lightweight\ndetection head is proposed to achieve the best balance between accuracy and\nefficiency. Subsequently, we introduce quantization techniques to map the\nmodel's floating-point weights and activation values into lower bit-width\nfixed-point representations, reducing computational demands and memory\nfootprint, ultimately accelerating the model's inference. Finally, we draw\ninspiration from GPU parallel programming principles to expedite the\npre-processing and post-processing stages of the algorithm by doing parallel\nprocessing. The approach is evaluated with public and challenging dataset\nRailSem19 and tested on Jetson Nano. Experimental results demonstrate that our\nenhanced algorithm achieves an accuracy level of 83.3% while achieving a\nreal-time inference rate of 25 frames per second when the input size is\n480x480, thereby effectively meeting the requirements for real-time and\nhigh-efficiency operation.\n","authors":["Chen Chenglin","Wang Fei","Yang Min","Qin Yong","Bai Yun"],"pdf_url":"https://arxiv.org/pdf/2401.11492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v4","updated":"2024-01-21T13:38:20Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. 
Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving substantial memory and time costs compared to vanilla\nPT and its variants, without changing trainable parameter sizes. Through\nextensive experiments on 23 natural language processing (NLP) and\nvision-language (VL) tasks, we demonstrate that DePT outperforms\nstate-of-the-art PEFT approaches, including the full fine-tuning baseline, in\nsome scenarios. Additionally, we empirically show that DEPT grows more\nefficient as the model size increases. Our further study reveals that DePT\nintegrates seamlessly with parameter-efficient transfer learning in the\nfew-shot learning setting and highlights its adaptability to various model\narchitectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v4.pdf","comment":"ICLR 2024. Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2303.11681v4","updated":"2024-01-21T13:35:44Z","published":"2023-03-21T08:43:15Z","title":"DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic\n Segmentation Using Diffusion Models","summary":" Collecting and annotating images with pixel-wise labels is time-consuming and\nlaborious. In contrast, synthetic data can be freely available using a\ngenerative model (e.g., DALL-E, Stable Diffusion). In this paper, we show that\nit is possible to automatically obtain accurate semantic masks of synthetic\nimages generated by the Off-the-shelf Stable Diffusion model, which uses only\ntext-image pairs during training. Our approach, called DiffuMask, exploits the\npotential of the cross-attention map between text and image, which is natural\nand seamless to extend the text-driven image synthesis to semantic mask\ngeneration. DiffuMask uses text-guided cross-attention information to localize\nclass/word-specific regions, which are combined with practical techniques to\ncreate a novel high-resolution and class-discriminative pixel-wise mask. The\nmethods help to reduce data collection and annotation costs obviously.\nExperiments demonstrate that the existing segmentation methods trained on\nsynthetic data of DiffuMask can achieve a competitive performance over the\ncounterpart of real data (VOC 2012, Cityscapes). For some classes (e.g., bird),\nDiffuMask presents promising performance, close to the stateof-the-art result\nof real data (within 3% mIoU gap). Moreover, in the open-vocabulary\nsegmentation (zero-shot) setting, DiffuMask achieves a new SOTA result on\nUnseen class of VOC 2012. The project website can be found at\nhttps://weijiawu.github.io/DiffusionMask/.\n","authors":["Weijia Wu","Yuzhong Zhao","Mike Zheng Shou","Hong Zhou","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2303.11681v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11489v1","updated":"2024-01-21T13:30:02Z","published":"2024-01-21T13:30:02Z","title":"MapChange: Enhancing Semantic Change Detection with Temporal-Invariant\n Historical Maps Based on Deep Triplet Network","summary":" Semantic Change Detection (SCD) is recognized as both a crucial and\nchallenging task in the field of image analysis. 
Traditional methods for SCD\nhave predominantly relied on the comparison of image pairs. However, this\napproach is significantly hindered by substantial imaging differences, which\narise due to variations in shooting times, atmospheric conditions, and angles.\nSuch discrepancies lead to two primary issues: the under-detection of minor yet\nsignificant changes, and the generation of false alarms due to temporal\nvariances. These factors often result in unchanged objects appearing markedly\ndifferent in multi-temporal images. In response to these challenges, the\nMapChange framework has been developed. This framework introduces a novel\nparadigm that synergizes temporal-invariant historical map data with\ncontemporary high-resolution images. By employing this combination, the\ntemporal variance inherent in conventional image pair comparisons is\neffectively mitigated. The efficacy of the MapChange framework has been\nempirically validated through comprehensive testing on two public datasets.\nThese tests have demonstrated the framework's marked superiority over existing\nstate-of-the-art SCD methods.\n","authors":["Yinhe Liu","Sunan Shi","Zhuo Zheng","Jue Wang","Shiqi Tian","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.11489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01738v4","updated":"2024-01-21T13:27:31Z","published":"2023-08-03T12:58:23Z","title":"Enhancing Visibility in Nighttime Haze Images Using Guided APSF and\n Gradient Adaptive Convolution","summary":" Visibility in hazy nighttime scenes is frequently reduced by multiple\nfactors, including low light, intense glow, light scattering, and the presence\nof multicolored light sources. Existing nighttime dehazing methods often\nstruggle with handling glow or low-light conditions, resulting in either\nexcessively dark visuals or unsuppressed glow outputs. In this paper, we\nenhance the visibility from a single nighttime haze image by suppressing glow\nand enhancing low-light regions. To handle glow effects, our framework learns\nfrom the rendered glow pairs. Specifically, a light source aware network is\nproposed to detect light sources of night images, followed by the APSF\n(Atmospheric Point Spread Function)-guided glow rendering. Our framework is\nthen trained on the rendered images, resulting in glow suppression. Moreover,\nwe utilize gradient-adaptive convolution, to capture edges and textures in hazy\nscenes. By leveraging extracted edges and textures, we enhance the contrast of\nthe scene without losing important structural details. To boost low-light\nintensity, our network learns an attention map, then adjusted by gamma\ncorrection. This attention has high values on low-light regions and low values\non haze and glow regions. Extensive evaluation on real nighttime haze images,\ndemonstrates the effectiveness of our method. Our experiments demonstrate that\nour method achieves a PSNR of 30.38dB, outperforming state-of-the-art methods\nby 13% on GTA5 nighttime haze dataset. Our data and code is available at\nhttps://github.com/jinyeying/nighttime_dehaze.\n","authors":["Yeying Jin","Beibei Lin","Wending Yan","Yuan Yuan","Wei Ye","Robby T. 
Tan"],"pdf_url":"https://arxiv.org/pdf/2308.01738v4.pdf","comment":"Accepted to ACM'MM2023, https://github.com/jinyeying/nighttime_dehaze"},{"id":"http://arxiv.org/abs/2308.10610v2","updated":"2024-01-21T13:23:10Z","published":"2023-08-21T10:20:46Z","title":"Ultrafast and Ultralight Network-Based Intelligent System for Real-time\n Diagnosis of Ear diseases in Any Devices","summary":" Traditional ear disease diagnosis heavily depends on experienced specialists\nand specialized equipment, frequently resulting in misdiagnoses, treatment\ndelays, and financial burdens for some patients. Utilizing deep learning models\nfor efficient ear disease diagnosis has proven effective and affordable.\nHowever, existing research overlooked model inference speed and parameter size\nrequired for deployment. To tackle these challenges, we constructed a\nlarge-scale dataset comprising eight ear disease categories and normal ear\ncanal samples from two hospitals. Inspired by ShuffleNetV2, we developed\nBest-EarNet, an ultrafast and ultralight network enabling real-time ear disease\ndiagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature\nFusion Module which can capture global and local spatial information\nsimultaneously and guide the network to focus on crucial regions within feature\nmaps at various levels, mitigating low accuracy issues. Moreover, our network\nuses multiple auxiliary classification heads for efficient parameter\noptimization. With 0.77M parameters, Best-EarNet achieves an average frames per\nsecond of 80 on CPU. Employing transfer learning and five-fold cross-validation\nwith 22,581 images from Hospital-1, the model achieves an impressive 95.23%\naccuracy. External testing on 1,652 images from Hospital-2 validates its\nperformance, yielding 92.14% accuracy. Compared to state-of-the-art networks,\nBest-EarNet establishes a new state-of-the-art (SOTA) in practical\napplications. Most importantly, we developed an intelligent diagnosis system\ncalled Ear Keeper, which can be deployed on common electronic devices. By\nmanipulating a compact electronic otoscope, users can perform comprehensive\nscanning and diagnosis of the ear canal using real-time video. This study\nprovides a novel paradigm for ear endoscopy and other medical endoscopic image\nrecognition applications.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Haihua Liang","Fan Zhang","Yanmei Chen","Zefeng Xie","Wenrui Wu","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11485v1","updated":"2024-01-21T13:16:33Z","published":"2024-01-21T13:16:33Z","title":"ColorVideoVDP: A visual difference predictor for image, video and\n display distortions","summary":" ColorVideoVDP is a video and image quality metric that models spatial and\ntemporal aspects of vision, for both luminance and color. The metric is built\non novel psychophysical models of chromatic spatiotemporal contrast sensitivity\nand cross-channel contrast masking. It accounts for the viewing conditions,\ngeometric, and photometric characteristics of the display. It was trained to\npredict common video streaming distortions (e.g. video compression, rescaling,\nand transmission errors), and also 8 new distortion types related to AR/VR\ndisplays (e.g. light source and waveguide non-uniformities). To address the\nlatter application, we collected our novel XR-Display-Artifact-Video quality\ndataset (XR-DAVID), comprised of 336 distorted videos. 
Extensive testing on\nXR-DAVID, as well as several datasets from the literature, indicate a\nsignificant gain in prediction performance compared to existing metrics.\nColorVideoVDP opens the doors to many novel applications which require the\njoint automated spatiotemporal assessment of luminance and color distortions,\nincluding video streaming, display specification and design, visual comparison\nof results, and perceptually-guided quality optimization.\n","authors":["Rafal K. Mantiuk","Param Hanji","Maliha Ashraf","Yuta Asano","Alexandre Chapiro"],"pdf_url":"https://arxiv.org/pdf/2401.11485v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2401.04614v2","updated":"2024-01-21T12:56:32Z","published":"2024-01-09T15:36:07Z","title":"Generic Knowledge Boosted Pre-training For Remote Sensing Images","summary":" Deep learning models are essential for scene classification, change\ndetection, land cover segmentation, and other remote sensing image\nunderstanding tasks. Most backbones of existing remote sensing deep learning\nmodels are typically initialized by pre-trained weights obtained from ImageNet\npre-training (IMP). However, domain gaps exist between remote sensing images\nand natural images (e.g., ImageNet), making deep learning models initialized by\npre-trained weights of IMP perform poorly for remote sensing image\nunderstanding. Although some pre-training methods are studied in the remote\nsensing community, current remote sensing pre-training methods face the problem\nof vague generalization by only using remote sensing images. In this paper, we\npropose a novel remote sensing pre-training framework, Generic Knowledge\nBoosted Remote Sensing Pre-training (GeRSP), to learn robust representations\nfrom remote sensing and natural images for remote sensing understanding tasks.\nGeRSP contains two pre-training branches: (1) A self-supervised pre-training\nbranch is adopted to learn domain-related representations from unlabeled remote\nsensing images. (2) A supervised pre-training branch is integrated into GeRSP\nfor general knowledge learning from labeled natural images. Moreover, GeRSP\ncombines two pre-training branches using a teacher-student architecture to\nsimultaneously learn representations with general and special knowledge, which\ngenerates a powerful pre-trained model for deep learning model initialization.\nFinally, we evaluate GeRSP and other remote sensing pre-training methods on\nthree downstream tasks, i.e., object detection, semantic segmentation, and\nscene classification. The extensive experimental results consistently\ndemonstrate that GeRSP can effectively learn robust representations in a\nunified manner, improving the performance of remote sensing downstream tasks.\n","authors":["Ziyue Huang","Mingming Zhang","Yuan Gong","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.04614v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01632v3","updated":"2024-01-21T12:50:08Z","published":"2023-12-04T05:24:45Z","title":"GaussianHead: High-fidelity Head Avatars with Learnable Gaussian\n Derivation","summary":" Constructing vivid 3D head avatars for given subjects and realizing a series\nof animations on them is valuable yet challenging. This paper presents\nGaussianHead, which models the actional human head with anisotropic 3D\nGaussians. In our framework, a motion deformation field and multi-resolution\ntri-plane are constructed respectively to deal with the head's dynamic geometry\nand complex texture. 
Notably, we impose an exclusive derivation scheme on each\nGaussian, which generates its multiple doppelgangers through a set of learnable\nparameters for position transformation. With this design, we can compactly and\naccurately encode the appearance information of Gaussians, even those fitting\nthe head's particular components with sophisticated structures. In addition, an\ninherited derivation strategy for newly added Gaussians is adopted to\nfacilitate training acceleration. Extensive experiments show that our method\ncan produce high-fidelity renderings, outperforming state-of-the-art approaches\nin reconstruction, cross-identity reenactment, and novel view synthesis tasks.\nOur code is available at: https://github.com/chiehwangs/gaussian-head.\n","authors":["Jie Wang","Jiu-Cheng Xie","Xianyan Li","Feng Xu","Chi-Man Pun","Hao Gao"],"pdf_url":"https://arxiv.org/pdf/2312.01632v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04089v2","updated":"2024-01-21T12:32:04Z","published":"2023-09-08T02:58:17Z","title":"Toward Sufficient Spatial-Frequency Interaction for Gradient-aware\n Underwater Image Enhancement","summary":" Underwater images suffer from complex and diverse degradation, which\ninevitably affects the performance of underwater visual tasks. However, most\nexisting learning-based Underwater image enhancement (UIE) methods mainly\nrestore such degradations in the spatial domain, and rarely pay attention to\nthe fourier frequency information. In this paper, we develop a novel UIE\nframework based on spatial-frequency interaction and gradient maps, namely\nSFGNet, which consists of two stages. Specifically, in the first stage, we\npropose a dense spatial-frequency fusion network (DSFFNet), mainly including\nour designed dense fourier fusion block and dense spatial fusion block,\nachieving sufficient spatial-frequency interaction by cross connections between\nthese two blocks. In the second stage, we propose a gradient-aware corrector\n(GAC) to further enhance perceptual details and geometric structures of images\nby gradient map. Experimental results on two real-world underwater image\ndatasets show that our approach can successfully enhance underwater images, and\nachieves competitive performance in visual quality improvement. The code is\navailable at https://github.com/zhihefang/SFGNet.\n","authors":["Chen Zhao","Weiling Cai","Chenyu Dong","Ziqi Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.04089v2.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.11470v1","updated":"2024-01-21T11:55:42Z","published":"2024-01-21T11:55:42Z","title":"Exploring Missing Modality in Multimodal Egocentric Datasets","summary":" Multimodal video understanding is crucial for analyzing egocentric videos,\nwhere integrating multiple sensory signals significantly enhances action\nrecognition and moment localization. However, practical applications often\ngrapple with incomplete modalities due to factors like privacy concerns,\nefficiency demands, or hardware malfunctions. Addressing this, our study delves\ninto the impact of missing modalities on egocentric action recognition,\nparticularly within transformer-based models. We introduce a novel concept\n-Missing Modality Token (MMT)-to maintain performance even when modalities are\nabsent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and\nEpic-Sounds datasets. 
Our method mitigates the performance loss, reducing it\nfrom its original $\\sim 30\\%$ drop to only $\\sim 10\\%$ when half of the test\nset is modal-incomplete. Through extensive experimentation, we demonstrate the\nadaptability of MMT to different training scenarios and its superiority in\nhandling missing modalities compared to current methods. Our research\ncontributes a comprehensive analysis and an innovative approach, opening\navenues for more resilient multimodal systems in real-world settings.\n","authors":["Merey Ramazanova","Alejandro Pardo","Humam Alwassel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.11470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14835v2","updated":"2024-01-21T11:13:31Z","published":"2022-11-27T14:18:40Z","title":"CLID: Controlled-Length Image Descriptions with Limited Data","summary":" Controllable image captioning models generate human-like image descriptions,\nenabling some kind of control over the generated captions. This paper focuses\non controlling the caption length, i.e. a short and concise description or a\nlong and detailed one. Since existing image captioning datasets contain mostly\nshort captions, generating long captions is challenging. To address the\nshortage of long training examples, we propose to enrich the dataset with\nvarying-length self-generated captions. These, however, might be of varying\nquality and are thus unsuitable for conventional training. We introduce a novel\ntraining strategy that selects the data points to be used at different times\nduring the training. Our method dramatically improves the length-control\nabilities, while exhibiting SoTA performance in terms of caption quality. Our\napproach is general and is shown to be applicable also to paragraph generation.\n","authors":["Elad Hirsch","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2211.14835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11464v1","updated":"2024-01-21T11:12:00Z","published":"2024-01-21T11:12:00Z","title":"Task-specific regularization loss towards model calibration for reliable\n lung cancer detection","summary":" Lung cancer is one of the significant causes of cancer-related deaths\nglobally. Early detection and treatment improve the chances of survival.\nTraditionally CT scans have been used to extract the most significant lung\ninfection information and diagnose cancer. This process is carried out manually\nby an expert radiologist. The imbalance in the radiologists-to-population ratio\nin a country like India implies significant work pressure on them and thus\nraises the need to automate a few of their responsibilities. The tendency of\nmodern-day Deep Neural networks to make overconfident mistakes limit their\nusage to detect cancer. In this paper, we propose a new task-specific loss\nfunction to calibrate the neural network to reduce the risk of overconfident\nmistakes. We use the state-of-the-art Multi-class Difference in Confidence and\nAccuracy (MDCA) loss in conjunction with the proposed task-specific loss\nfunction to achieve the same. We also integrate post-hoc calibration by\nperforming temperature scaling on top of the train-time calibrated model. 
We\ndemonstrate 5.98% improvement in the Expected Calibration Error (ECE) and a\n17.9% improvement in Maximum Calibration Error (MCE) as compared to the\nbest-performing SOTA algorithm.\n","authors":["Mehar Prateek Kalra","Mansi Singhal","Rohan Raju Dhanakashirur"],"pdf_url":"https://arxiv.org/pdf/2401.11464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.04852v2","updated":"2024-01-21T10:54:55Z","published":"2021-06-09T07:20:54Z","title":"Deep Tiny Network for Recognition-Oriented Face Image Quality Assessment","summary":" Face recognition has made significant progress in recent years due to deep\nconvolutional neural networks (CNN). In many face recognition (FR) scenarios,\nface images are acquired from a sequence with huge intra-variations. These\nintra-variations, which are mainly affected by the low-quality face images,\ncause instability of recognition performance. Previous works have focused on\nad-hoc methods to select frames from a video or use face image quality\nassessment (FIQA) methods, which consider only a particular or combination of\nseveral distortions.\n In this work, we present an efficient non-reference image quality assessment\nfor FR that directly links image quality assessment (IQA) and FR. More\nspecifically, we propose a new measurement to evaluate image quality without\nany reference. Based on the proposed quality measurement, we propose a deep\nTiny Face Quality network (tinyFQnet) to learn a quality prediction function\nfrom data.\n We evaluate the proposed method for different powerful FR models on two\nclassical video-based (or template-based) benchmark: IJB-B and YTF. Extensive\nexperiments show that, although the tinyFQnet is much smaller than the others,\nthe proposed method outperforms state-of-the-art quality assessment methods in\nterms of effectiveness and efficiency.\n","authors":["Baoyun Peng","Min Liu","Zhaoning Zhang","Kai Xu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2106.04852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11453v1","updated":"2024-01-21T10:20:46Z","published":"2024-01-21T10:20:46Z","title":"Inter-Domain Mixup for Semi-Supervised Domain Adaptation","summary":" Semi-supervised domain adaptation (SSDA) aims to bridge source and target\ndomain distributions, with a small number of target labels available, achieving\nbetter classification performance than unsupervised domain adaptation (UDA).\nHowever, existing SSDA work fails to make full use of label information from\nboth source and target domains for feature alignment across domains, resulting\nin label mismatch in the label space during model testing. This paper presents\na novel SSDA approach, Inter-domain Mixup with Neighborhood Expansion (IDMNE),\nto tackle this issue. Firstly, we introduce a cross-domain feature alignment\nstrategy, Inter-domain Mixup, that incorporates label information into model\nadaptation. Specifically, we employ sample-level and manifold-level data mixing\nto generate compatible training samples. These newly established samples,\ncombined with reliable and actual label information, display diversity and\ncompatibility across domains, while such extra supervision thus facilitates\ncross-domain feature alignment and mitigates label mismatch. 
Additionally, we\nutilize Neighborhood Expansion to leverage high-confidence pseudo-labeled\nsamples in the target domain, diversifying the label information of the target\ndomain and thereby further increasing the performance of the adaptation model.\nAccordingly, the proposed approach outperforms existing state-of-the-art\nmethods, achieving significant accuracy improvements on popular SSDA\nbenchmarks, including DomainNet, Office-Home, and Office-31.\n","authors":["Jichang Li","Guanbin Li","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11453v1.pdf","comment":"Published to Elsevier PR2024, available at\n https://www.sciencedirect.com/science/article/pii/S0031320323007203?via%3Dihub"},{"id":"http://arxiv.org/abs/2401.11448v1","updated":"2024-01-21T09:57:56Z","published":"2024-01-21T09:57:56Z","title":"Adaptive Betweenness Clustering for Semi-Supervised Domain Adaptation","summary":" Compared to unsupervised domain adaptation, semi-supervised domain adaptation\n(SSDA) aims to significantly improve the classification performance and\ngeneralization capability of the model by leveraging the presence of a small\namount of labeled data from the target domain. Several SSDA approaches have\nbeen developed to enable semantic-aligned feature confusion between labeled (or\npseudo labeled) samples across domains; nevertheless, owing to the scarcity of\nsemantic label information of the target domain, they struggled to fully\nrealize their potential. In this study, we propose a novel SSDA approach named\nGraph-based Adaptive Betweenness Clustering (G-ABC) for achieving categorical\ndomain alignment, which enables cross-domain semantic alignment by mandating\nsemantic transfer from labeled data of both the source and target domains to\nunlabeled target samples. In particular, a heterogeneous graph is initially\nconstructed to reflect the pairwise relationships between labeled samples from\nboth domains and unlabeled ones of the target domain. Then, to degrade the\nnoisy connectivity in the graph, connectivity refinement is conducted by\nintroducing two strategies, namely Confidence Uncertainty based Node Removal\nand Prediction Dissimilarity based Edge Pruning. Once the graph has been\nrefined, Adaptive Betweenness Clustering is introduced to facilitate semantic\ntransfer by using across-domain betweenness clustering and within-domain\nbetweenness clustering, thereby propagating semantic label information from\nlabeled samples across domains to unlabeled target data. Extensive experiments\non three standard benchmark datasets, namely DomainNet, Office-Home, and\nOffice-31, indicated that our method outperforms previous state-of-the-art SSDA\napproaches, demonstrating the superiority of the proposed G-ABC algorithm.\n","authors":["Jichang Li","Guanbin Li","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11448v1.pdf","comment":"16 pages, 9 figures, published to IEEE TIP"},{"id":"http://arxiv.org/abs/2401.11439v1","updated":"2024-01-21T09:39:11Z","published":"2024-01-21T09:39:11Z","title":"General Flow as Foundation Affordance for Scalable Robot Learning","summary":" We address the challenge of acquiring real-world manipulation skills with a\nscalable framework. Inspired by the success of large-scale auto-regressive\nprediction in Large Language Models (LLMs), we hold the belief that identifying\nan appropriate prediction target capable of leveraging large-scale datasets is\ncrucial for achieving efficient and universal learning. 
Therefore, we propose\nto utilize flow, which represents the future trajectories of 3D points on\nobjects of interest, as an ideal prediction target in robot learning. To\nexploit scalable data resources, we turn our attention to cross-embodiment\ndatasets. We develop, for the first time, a language-conditioned prediction\nmodel directly from large-scale RGBD human video datasets. Our predicted flow\noffers actionable geometric and physics guidance, thus facilitating stable\nzero-shot skill transfer in real-world scenarios.We deploy our method with a\npolicy based on closed-loop flow prediction. Remarkably, without any additional\ntraining, our method achieves an impressive 81% success rate in human-to-robot\nskill transfer, covering 18 tasks in 6 scenes. Our framework features the\nfollowing benefits: (1) scalability: leveraging cross-embodiment data\nresources; (2) universality: multiple object categories, including rigid,\narticulated, and soft bodies; (3) stable skill transfer: providing actionable\nguidance with a small inference domain-gap. These lead to a new pathway towards\nscalable general robot learning. Data, code, and model weights will be made\npublicly available.\n","authors":["Chengbo Yuan","Chuan Wen","Tong Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2401.11439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11436v1","updated":"2024-01-21T09:16:29Z","published":"2024-01-21T09:16:29Z","title":"Geometric Prior Guided Feature Representation Learning for Long-Tailed\n Classification","summary":" Real-world data are long-tailed, the lack of tail samples leads to a\nsignificant limitation in the generalization ability of the model. Although\nnumerous approaches of class re-balancing perform well for moderate class\nimbalance problems, additional knowledge needs to be introduced to help the\ntail class recover the underlying true distribution when the observed\ndistribution from a few tail samples does not represent its true distribution\nproperly, thus allowing the model to learn valuable information outside the\nobserved domain. In this work, we propose to leverage the geometric information\nof the feature distribution of the well-represented head class to guide the\nmodel to learn the underlying distribution of the tail class. Specifically, we\nfirst systematically define the geometry of the feature distribution and the\nsimilarity measures between the geometries, and discover four phenomena\nregarding the relationship between the geometries of different feature\ndistributions. Then, based on four phenomena, feature uncertainty\nrepresentation is proposed to perturb the tail features by utilizing the\ngeometry of the head class feature distribution. It aims to make the perturbed\nfeatures cover the underlying distribution of the tail class as much as\npossible, thus improving the model's generalization performance in the test\ndomain. Finally, we design a three-stage training scheme enabling feature\nuncertainty modeling to be successfully applied. Experiments on\nCIFAR-10/100-LT, ImageNet-LT, and iNaturalist2018 show that our proposed\napproach outperforms other similar methods on most metrics. 
In addition, the\nexperimental phenomena we discovered are able to provide new perspectives and\ntheoretical foundations for subsequent studies.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11436v1.pdf","comment":"This work was accepted by the IJCV"},{"id":"http://arxiv.org/abs/2401.09496v2","updated":"2024-01-21T08:51:37Z","published":"2024-01-17T01:37:17Z","title":"Learning to Generalize over Subpartitions for Heterogeneity-aware Domain\n Adaptive Nuclei Segmentation","summary":" Annotation scarcity and cross-modality/stain data distribution shifts are two\nmajor obstacles hindering the application of deep learning models for nuclei\nanalysis, which holds a broad spectrum of potential applications in digital\npathology. Recently, unsupervised domain adaptation (UDA) methods have been\nproposed to mitigate the distributional gap between different imaging\nmodalities for unsupervised nuclei segmentation in histopathology images.\nHowever, existing UDA methods are built upon the assumption that data\ndistributions within each domain should be uniform. Based on the\nover-simplified supposition, they propose to align the histopathology target\ndomain with the source domain integrally, neglecting severe intra-domain\ndiscrepancy over subpartitions incurred by mixed cancer types and sampling\norgans. In this paper, for the first time, we propose to explicitly consider\nthe heterogeneity within the histopathology domain and introduce open compound\ndomain adaptation (OCDA) to resolve the crux. In specific, a two-stage\ndisentanglement framework is proposed to acquire domain-invariant feature\nrepresentations at both image and instance levels. The holistic design\naddresses the limitations of existing OCDA approaches which struggle to capture\ninstance-wise variations. Two regularization strategies are specifically\ndevised herein to leverage the rich subpartition-specific characteristics in\nhistopathology images and facilitate subdomain decomposition. Moreover, we\npropose a dual-branch nucleus shape and structure preserving module to prevent\nnucleus over-generation and deformation in the synthesized images. Experimental\nresults on both cross-modality and cross-stain scenarios over a broad range of\ndiverse datasets demonstrate the superiority of our method compared with\nstate-of-the-art UDA and OCDA methods.\n","authors":["Jianan Fan","Dongnan Liu","Hang Chang","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2401.09496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11430v1","updated":"2024-01-21T08:35:25Z","published":"2024-01-21T08:35:25Z","title":"Exploring Diffusion Time-steps for Unsupervised Representation Learning","summary":" Representation learning is all about discovering the hidden modular\nattributes that generate the data faithfully. We explore the potential of\nDenoising Diffusion Probabilistic Model (DM) in unsupervised learning of the\nmodular attributes. We build a theoretical framework that connects the\ndiffusion time-steps and the hidden attributes, which serves as an effective\ninductive bias for unsupervised learning. 
Specifically, the forward diffusion\nprocess incrementally adds Gaussian noise to samples at each time-step, which\nessentially collapses different samples into similar ones by losing attributes,\ne.g., fine-grained attributes such as texture are lost with less noise added\n(i.e., early time-steps), while coarse-grained ones such as shape are lost by\nadding more noise (i.e., late time-steps). To disentangle the modular\nattributes, at each time-step t, we learn a t-specific feature to compensate\nfor the newly lost attribute, and the set of all 1,...,t-specific features,\ncorresponding to the cumulative set of lost attributes, are trained to make up\nfor the reconstruction error of a pre-trained DM at time-step t. On CelebA,\nFFHQ, and Bedroom datasets, the learned feature significantly improves\nattribute classification and enables faithful counterfactual generation, e.g.,\ninterpolating only one specified attribute between two images, validating the\ndisentanglement quality. Codes are in https://github.com/yue-zhongqi/diti.\n","authors":["Zhongqi Yue","Jiankun Wang","Qianru Sun","Lei Ji","Eric I-Chao Chang","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11430v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11425v1","updated":"2024-01-21T08:18:45Z","published":"2024-01-21T08:18:45Z","title":"Grayscale Image Colorization with GAN and CycleGAN in Different Image\n Domain","summary":" Automatic colorization of grayscale images has been a challenging task.\nPrevious research has applied supervised methods to this problem [\n1]. In this paper, we reproduce a GAN-based coloring model and experiment with\none of its variants. We also propose a CycleGAN-based model and evaluate\nthese methods on various datasets. The results show that the proposed CycleGAN\nmodel performs well on human-face coloring and comic coloring, but lacks the\nability to produce diverse colorizations.\n","authors":["Chen Liang","Yunchen Sheng","Yichen Mo"],"pdf_url":"https://arxiv.org/pdf/2401.11425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11421v1","updated":"2024-01-21T07:57:04Z","published":"2024-01-21T07:57:04Z","title":"Enhancing the vision-language foundation model with key semantic\n knowledge-emphasized report refinement","summary":" Recently, vision-language representation learning has made remarkable\nadvancements in building up medical foundation models, holding immense\npotential for transforming the landscape of clinical research and medical care.\nThe underlying hypothesis is that the rich knowledge embedded in radiology\nreports can effectively assist and guide the learning process, reducing the\nneed for additional labels. However, these reports tend to be complex and\nsometimes even consist of redundant descriptions that make the representation\nlearning too challenging to capture the key semantic information. This paper\ndevelops a novel iterative vision-language representation learning framework by\nproposing a key semantic knowledge-emphasized report refinement method.\nParticularly, raw radiology reports are refined to highlight the key\ninformation according to a constructed clinical dictionary and two\nmodel-optimized knowledge-enhancement metrics. The iterative framework is\ndesigned to progressively learn, starting from gaining a general understanding\nof the patient's condition based on raw reports, and gradually refining and\nextracting critical information essential to the fine-grained analysis tasks. 
The\neffectiveness of the proposed framework is validated on various downstream\nmedical image analysis tasks, including disease classification,\nregion-of-interest segmentation, and phrase grounding. Our framework surpasses\nseven state-of-the-art methods in both fine-tuning and zero-shot settings,\ndemonstrating its encouraging potential for different clinical applications.\n","authors":["Cheng Li","Weijian Huang","Hao Yang","Jiarun Liu","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11420v1","updated":"2024-01-21T07:48:39Z","published":"2024-01-21T07:48:39Z","title":"Embedded Hyperspectral Band Selection with Adaptive Optimization for\n Image Semantic Segmentation","summary":" Hyperspectral band selection plays a pivotal role in remote sensing and image\nanalysis, aiming to identify the most informative spectral bands while\nminimizing computational overhead. In this paper, we introduce a pioneering\napproach for hyperspectral band selection that offers an embedded solution,\nmaking it well-suited for resource-constrained or real-time applications. Our\nproposed method, embedded Hyperspectral Band Selection (EHBS), excels in\nselecting the best bands without the need for prior processing, seamlessly\nintegrating with the downstream task model. This is achieved through the\nadaptation of the Stochastic Gates (STG) algorithm, originally designed for\nfeature selection, for hyperspectral band selection in the context of image\nsemantic segmentation, and the integration of a dynamic optimizer, DoG, which\nremoves the need to tune the learning rate. To assess the\nperformance of our method, we introduce a novel metric for evaluating band\nselection methods across different target numbers of selected bands, quantified\nby the Area Under the Curve (AUC). We conduct experiments on two distinct\nsemantic-segmentation hyperspectral benchmark datasets, demonstrating its\nsuperiority in terms of its resulting accuracy and its ease of use compared to\nmany common and state-of-the-art methods. Furthermore, our contributions extend\nbeyond the realm of hyperspectral band selection. The adaptability of our\napproach to other tasks, especially those involving grouped features, opens up\npromising avenues for broader applications within the realm of deep learning,\nsuch as feature selection for feature groups. The demonstrated success on the\ntested datasets and the potential for application to a variety of tasks\nunderscore the value of our method as a substantial addition to the field of\ncomputer vision.\n","authors":["Yaniv Zimmer","Oren Glickman"],"pdf_url":"https://arxiv.org/pdf/2401.11420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17005v3","updated":"2024-01-21T07:36:52Z","published":"2023-11-28T17:59:04Z","title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark","summary":" With the rapid development of Multi-modal Large Language Models (MLLMs), a\nnumber of diagnostic benchmarks have recently emerged to evaluate the\ncomprehension capabilities of these models. However, most benchmarks\npredominantly assess spatial understanding in the static image tasks, while\noverlooking temporal understanding in the dynamic video tasks. To alleviate\nthis issue, we introduce a comprehensive Multi-modal Video understanding\nBenchmark, namely MVBench, which covers 20 challenging video tasks that cannot\nbe effectively solved with a single frame. 
Specifically, we first introduce a\nnovel static-to-dynamic method to define these temporal-related tasks. By\ntransforming various static tasks into dynamic ones, we enable the systematic\ngeneration of video tasks that require a broad spectrum of temporal skills,\nranging from perception to cognition. Then, guided by the task definition, we\nautomatically convert public video annotations into multiple-choice QA to\nevaluate each task. On one hand, such a distinct paradigm allows us to build\nMVBench efficiently, without much manual intervention. On the other hand, it\nguarantees evaluation fairness with ground-truth video annotations, avoiding\nthe biased scoring of LLMs. Moreover, we further develop a robust video MLLM\nbaseline, i.e., VideoChat2, by progressive multi-modal training with diverse\ninstruction-tuning data. The extensive results on our MVBench reveal that, the\nexisting MLLMs are far from satisfactory in temporal understanding, while our\nVideoChat2 largely surpasses these leading models by over 15% on MVBench. All\nmodels and data are available at https://github.com/OpenGVLab/Ask-Anything.\n","authors":["Kunchang Li","Yali Wang","Yinan He","Yizhuo Li","Yi Wang","Yi Liu","Zun Wang","Jilan Xu","Guo Chen","Ping Luo","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.17005v3.pdf","comment":"18 pages, 7 figures, 19 tables"},{"id":"http://arxiv.org/abs/2401.09671v2","updated":"2024-01-21T07:27:25Z","published":"2024-01-18T01:07:00Z","title":"Towards Identifiable Unsupervised Domain Translation: A Diversified\n Distribution Matching Approach","summary":" Unsupervised domain translation (UDT) aims to find functions that convert\nsamples from one domain (e.g., sketches) to another domain (e.g., photos)\nwithout changing the high-level semantic meaning (also referred to as\n``content''). The translation functions are often sought by probability\ndistribution matching of the transformed source domain and target domain.\nCycleGAN stands as arguably the most representative approach among this line of\nwork. However, it was noticed in the literature that CycleGAN and variants\ncould fail to identify the desired translation functions and produce\ncontent-misaligned translations. This limitation arises due to the presence of\nmultiple translation functions -- referred to as ``measure-preserving\nautomorphism\" (MPA) -- in the solution space of the learning criteria. Despite\nawareness of such identifiability issues, solutions have remained elusive. This\nstudy delves into the core identifiability inquiry and introduces an MPA\nelimination theory. Our analysis shows that MPA is unlikely to exist, if\nmultiple pairs of diverse cross-domain conditional distributions are matched by\nthe learning function. Our theory leads to a UDT learner using distribution\nmatching over auxiliary variable-induced subsets of the domains -- other than\nover the entire data domains as in the classical approaches. The proposed\nframework is the first to rigorously establish translation identifiability\nunder reasonable UDT settings, to our best knowledge. 
Experiments corroborate\nwith our theoretical claims.\n","authors":["Sagar Shrestha","Xiao Fu"],"pdf_url":"https://arxiv.org/pdf/2401.09671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11414v1","updated":"2024-01-21T06:47:33Z","published":"2024-01-21T06:47:33Z","title":"S$^3$M-Net: Joint Learning of Semantic Segmentation and Stereo Matching\n for Autonomous Driving","summary":" Semantic segmentation and stereo matching are two essential components of 3D\nenvironmental perception systems for autonomous driving. Nevertheless,\nconventional approaches often address these two problems independently,\nemploying separate models for each task. This approach poses practical\nlimitations in real-world scenarios, particularly when computational resources\nare scarce or real-time performance is imperative. Hence, in this article, we\nintroduce S$^3$M-Net, a novel joint learning framework developed to perform\nsemantic segmentation and stereo matching simultaneously. Specifically,\nS$^3$M-Net shares the features extracted from RGB images between both tasks,\nresulting in an improved overall scene understanding capability. This feature\nsharing process is realized using a feature fusion adaption (FFA) module, which\neffectively transforms the shared features into semantic space and subsequently\nfuses them with the encoded disparity features. The entire joint learning\nframework is trained by minimizing a novel semantic consistency-guided (SCG)\nloss, which places emphasis on the structural consistency in both tasks.\nExtensive experimental results conducted on the vKITTI2 and KITTI datasets\ndemonstrate the effectiveness of our proposed joint learning framework and its\nsuperior performance compared to other state-of-the-art single-task networks.\nOur project webpage is accessible at mias.group/S3M-Net.\n","authors":["Zhiyuan Wu","Yi Feng","Chuang-Wei Liu","Fisher Yu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2401.11414v1.pdf","comment":"accepted to IEEE Trans. on Intelligent Vehicles (T-IV)"},{"id":"http://arxiv.org/abs/2401.11406v1","updated":"2024-01-21T05:50:39Z","published":"2024-01-21T05:50:39Z","title":"Adversarial Augmentation Training Makes Action Recognition Models More\n Robust to Realistic Video Distribution Shifts","summary":" Despite recent advances in video action recognition achieving strong\nperformance on existing benchmarks, these models often lack robustness when\nfaced with natural distribution shifts between training and test data. We\npropose two novel evaluation methods to assess model resilience to such\ndistribution disparity. One method uses two different datasets collected from\ndifferent sources and uses one for training and validation, and the other for\ntesting. More precisely, we created dataset splits of HMDB-51 or UCF-101 for\ntraining, and Kinetics-400 for testing, using the subset of the classes that\nare overlapping in both train and test datasets. The other proposed method\nextracts the feature mean of each class from the target evaluation dataset's\ntraining data (i.e. class prototype) and estimates test video prediction as a\ncosine similarity score between each sample to the class prototypes of each\ntarget class. This procedure does not alter model weights using the target\ndataset and it does not require aligning overlapping classes of two different\ndatasets, thus is a very efficient method to test the model robustness to\ndistribution shifts without prior knowledge of the target distribution. 
We\naddress the robustness problem by adversarial augmentation training -\ngenerating augmented views of videos that are \"hard\" for the classification\nmodel by applying gradient ascent on the augmentation parameters - as well as\n\"curriculum\" scheduling the strength of the video augmentations. We\nexperimentally demonstrate the superior performance of the proposed adversarial\naugmentation approach over baselines across three state-of-the-art action\nrecognition models - TSM, Video Swin Transformer, and Uniformer. The presented\nwork provides critical insight into model robustness to distribution shifts and\npresents effective techniques to enhance video action recognition performance\nin a real-world deployment.\n","authors":["Kiyoon Kim","Shreyank N Gowda","Panagiotis Eustratiadis","Antreas Antoniou","Robert B Fisher"],"pdf_url":"https://arxiv.org/pdf/2401.11406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11737v2","updated":"2024-01-21T04:55:06Z","published":"2023-08-22T18:57:07Z","title":"Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape","summary":" Accurately estimating the 3D pose and shape is an essential step towards\nunderstanding animal behavior, and can potentially benefit many downstream\napplications, such as wildlife conservation. However, research in this area is\nheld back by the lack of a comprehensive and diverse dataset with high-quality\n3D pose and shape annotations. In this paper, we propose Animal3D, the first\ncomprehensive dataset for mammal animal 3D pose and shape estimation. Animal3D\nconsists of 3379 images collected from 40 mammal species, high-quality\nannotations of 26 keypoints, and importantly the pose and shape parameters of\nthe SMAL model. All annotations were labeled and checked manually in a\nmulti-stage process to ensure highest quality results. Based on the Animal3D\ndataset, we benchmark representative shape and pose estimation models at: (1)\nsupervised learning from only the Animal3D data, (2) synthetic to real transfer\nfrom synthetically generated images, and (3) fine-tuning human pose and shape\nestimation models. Our experimental results demonstrate that predicting the 3D\nshape and pose of animals across species remains a very challenging task,\ndespite significant advances in human pose estimation. Our results further\ndemonstrate that synthetic pre-training is a viable strategy to boost the model\nperformance. Overall, Animal3D opens new directions for facilitating future\nresearch in animal 3D pose and shape estimation, and is publicly available.\n","authors":["Jiacong Xu","Yi Zhang","Jiawei Peng","Wufei Ma","Artur Jesslen","Pengliang Ji","Qixin Hu","Jiehua Zhang","Qihao Liu","Jiahao Wang","Wei Ji","Chen Wang","Xiaoding Yuan","Prakhar Kaushik","Guofeng Zhang","Jie Liu","Yushan Xie","Yawen Cui","Alan Yuille","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2308.11737v2.pdf","comment":"11 pages, 5 figures, link to the dataset:\n https://xujiacong.github.io/Animal3D/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.11632v1","updated":"2024-01-21T23:56:57Z","published":"2024-01-21T23:56:57Z","title":"What Are We Optimizing For? A Human-centric Evaluation Of Deep\n Learning-based Recommender Systems","summary":" Deep learning-based (DL) models in recommender systems (RecSys) have gained\nsignificant recognition for their remarkable accuracy in predicting user\npreferences. 
However, their performance often lacks a comprehensive evaluation\nfrom a human-centric perspective, which encompasses various dimensions beyond\nsimple interest matching. In this work, we have developed a robust\nhuman-centric evaluation framework that incorporates seven diverse metrics to\nassess the quality of recommendations generated by five recent open-sourced DL\nmodels. Our evaluation datasets consist of both offline benchmark data and\npersonalized online recommendation feedback collected from 445 real users. We\nfind that (1) different DL models have different pros and cons in the\nmulti-dimensional metrics that we test with; (2) users generally want a\ncombination of accuracy with at least one another human values in the\nrecommendation; (3) the degree of combination of different values needs to be\ncarefully experimented to user preferred level.\n","authors":["Ruixuan Sun","Avinash Akella","Xinyi Wu","Ruoyan Kong","Joseph A. Konstan"],"pdf_url":"https://arxiv.org/pdf/2401.11632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v1","updated":"2024-01-21T23:34:42Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["an Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11509v1","updated":"2024-01-21T14:35:54Z","published":"2024-01-21T14:35:54Z","title":"Simple Domain Adaptation for Sparse Retrievers","summary":" In Information Retrieval, and more generally in Natural Language Processing,\nadapting models to specific domains is conducted through fine-tuning. Despite\nthe successes achieved by this method and its versatility, the need for\nhuman-curated and labeled data makes it impractical to transfer to new tasks,\ndomains, and/or languages when training data doesn't exist. Using the model\nwithout training (zero-shot) is another option that however suffers an\neffectiveness cost, especially in the case of first-stage retrievers. Numerous\nresearch directions have emerged to tackle these issues, most of them in the\ncontext of adapting to a task or a language. However, the literature is scarcer\nfor domain (or topic) adaptation. In this paper, we address this issue of\ncross-topic discrepancy for a sparse first-stage retriever by transposing a\nmethod initially designed for language adaptation. 
By leveraging pre-training\non the target data to learn domain-specific knowledge, this technique\nalleviates the need for annotated data and expands the scope of domain\nadaptation. Despite their relatively good generalization ability, we show that\neven sparse retrievers can benefit from our simple domain adaptation method.\n","authors":["Mathias Vast","Yuxuan Zong","Basile Van Cooten","Benjamin Piwowarski","Laure Soulier"],"pdf_url":"https://arxiv.org/pdf/2401.11509v1.pdf","comment":"Accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2401.11506v1","updated":"2024-01-21T14:33:52Z","published":"2024-01-21T14:33:52Z","title":"Enhancing Recommendation Diversity by Re-ranking with Large Language\n Models","summary":" It has long been recognized that it is not enough for a Recommender System\n(RS) to provide recommendations based only on their relevance to users. Among\nmany other criteria, the set of recommendations may need to be diverse in order\nto handle uncertainty and offer a meaningful choice. The literature reports\nmany ways of measuring diversity and ways of improving the diversity of a set\nof recommendations, most notably by re-ranking and selecting from a larger set\nof candidate recommendations. Driven by promising insights from the literature\non how to incorporate versatile Large Language Models (LLMs) into the RS\npipeline, in this paper, we show how LLMs can be used for diversity re-ranking.\n We begin with an informal study that verifies that LLMs can be used for\nre-ranking tasks and do have some understanding of the concept of diversity.\nThen, we design a more rigorous methodology where LLMs are prompted to generate\na diverse ranking from a candidate ranking using various prompt templates with\ndifferent re-ranking instructions in a zero-shot fashion. We conduct\ncomprehensive experiments testing state-of-the-art conversational LLMs from the\nGPT and Llama families. We compare their re-ranking capabilities with random\nre-ranking and various traditional re-ranking methods from the literature (MMR,\nxQuAD and RxQuAD). We find that LLM-based re-ranking outperforms random\nre-ranking across all the metrics that we use but does not perform as well as\nthe traditional re-ranking methods. We gain insight into prompt design for this\ntask (e.g.\\ on the whole, it is better to prompt for diversity rather than a\nbalance of diversity and relevance). Given that no special knowledge\nengineering is needed, we conclude that LLM-based re-ranking is a promising\napproach, and we highlight directions for future research. We open-source the\ncode of our experiments for reproducibility.\n","authors":["Diego Carraro","Derek Bridge"],"pdf_url":"https://arxiv.org/pdf/2401.11506v1.pdf","comment":"32 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.11505v1","updated":"2024-01-21T14:30:20Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. 
To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. Code and\nmodels are available at https://github.com/kakaobrain/CheXGPT.\n","authors":["Jawook Gu","Han-Cheol Cho","Jiho Kim","Kihyun You","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.11478v1","updated":"2024-01-21T12:51:28Z","published":"2024-01-21T12:51:28Z","title":"D2K: Turning Historical Data into Retrievable Knowledge for Recommender\n Systems","summary":" A vast amount of user behavior data is constantly accumulating on today's\nlarge recommendation platforms, recording users' various interests and tastes.\nPreserving knowledge from the old data while new data continually arrives is a\nvital problem for recommender systems. Existing approaches generally seek to\nsave the knowledge implicitly in the model parameters. However, such a\nparameter-centric approach lacks scalability and flexibility -- the capacity is\nhard to scale, and the knowledge is inflexible to utilize. Hence, in this work,\nwe propose a framework that turns massive user behavior data to retrievable\nknowledge (D2K). It is a data-centric approach that is model-agnostic and easy\nto scale up. Different from only storing unary knowledge such as the user-side\nor item-side information, D2K propose to store ternary knowledge for\nrecommendation, which is determined by the complete recommendation factors --\nuser, item, and context. The knowledge retrieved by target samples can be\ndirectly used to enhance the performance of any recommendation algorithms.\nSpecifically, we introduce a Transformer-based knowledge encoder to transform\nthe old data into knowledge with the user-item-context cross features. A\npersonalized knowledge adaptation unit is devised to effectively exploit the\ninformation from the knowledge base by adapting the retrieved knowledge to the\ntarget samples. Extensive experiments on two public datasets show that D2K\nsignificantly outperforms existing baselines and is compatible with a major\ncollection of recommendation algorithms.\n","authors":["Jiarui Qin","Weiwen Liu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11478v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.11463v1","updated":"2024-01-21T11:04:30Z","published":"2024-01-21T11:04:30Z","title":"Estimating the Usefulness of Clarifying Questions and Answers for\n Conversational Search","summary":" While the body of research directed towards constructing and generating\nclarifying questions in mixed-initiative conversational search systems is vast,\nresearch aimed at processing and comprehending users' answers to such questions\nis scarce. 
To this end, we present a simple yet effective method for processing\nanswers to clarifying questions, moving away from previous work that simply\nappends answers to the original query and thus potentially degrades retrieval\nperformance. Specifically, we propose a classifier for assessing usefulness of\nthe prompted clarifying question and an answer given by the user. Useful\nquestions or answers are further appended to the conversation history and\npassed to a transformer-based query rewriting module. Results demonstrate\nsignificant improvements over strong non-mixed-initiative baselines.\nFurthermore, the proposed approach mitigates the performance drops when non\nuseful questions and answers are utilized.\n","authors":["Ivan Sekulić","Weronika Łajewska","Krisztian Balog","Fabio Crestani"],"pdf_url":"https://arxiv.org/pdf/2401.11463v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11452v1","updated":"2024-01-21T10:15:36Z","published":"2024-01-21T10:15:36Z","title":"Towards Reliable and Factual Response Generation: Detecting Unanswerable\n Questions in Information-Seeking Conversations","summary":" Generative AI models face the challenge of hallucinations that can undermine\nusers' trust in such systems. We approach the problem of conversational\ninformation seeking as a two-step process, where relevant passages in a corpus\nare identified first and then summarized into a final system response. This way\nwe can automatically assess if the answer to the user's question is present in\nthe corpus. Specifically, our proposed method employs a sentence-level\nclassifier to detect if the answer is present, then aggregates these\npredictions on the passage level, and eventually across the top-ranked passages\nto arrive at a final answerability estimate. For training and evaluation, we\ndevelop a dataset based on the TREC CAsT benchmark that includes answerability\nlabels on the sentence, passage, and ranking levels. We demonstrate that our\nproposed method represents a strong baseline and outperforms a state-of-the-art\nLLM on the answerability prediction task.\n","authors":["Weronika Łajewska","Krisztian Balog"],"pdf_url":"https://arxiv.org/pdf/2401.11452v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11441v1","updated":"2024-01-21T09:42:24Z","published":"2024-01-21T09:42:24Z","title":"On-Device Recommender Systems: A Comprehensive Survey","summary":" Recommender systems have been widely deployed in various real-world\napplications to help users identify content of interest from massive amounts of\ninformation. Traditional recommender systems work by collecting user-item\ninteraction data in a cloud-based data center and training a centralized model\nto perform the recommendation service. However, such cloud-based recommender\nsystems (CloudRSs) inevitably suffer from excessive resource consumption,\nresponse latency, as well as privacy and security risks concerning both data\nand models. 
Recently, driven by the advances in storage, communication, and\ncomputation capabilities of edge devices, there has been a shift of focus from\nCloudRSs to on-device recommender systems (DeviceRSs), which leverage the\ncapabilities of edge devices to minimize centralized data storage requirements,\nreduce the response latency caused by communication overheads, and enhance user\nprivacy and security by localizing data processing and model training. Despite\nthe rapid rise of DeviceRSs, there is a clear absence of timely literature\nreviews that systematically introduce, categorize and contrast these methods.\nTo bridge this gap, we aim to provide a comprehensive survey of DeviceRSs,\ncovering three main aspects: (1) the deployment and inference of DeviceRSs (2)\nthe training and update of DeviceRSs (3) the security and privacy of DeviceRSs.\nFurthermore, we provide a fine-grained and systematic taxonomy of the methods\ninvolved in each aspect, followed by a discussion regarding challenges and\nfuture research directions. This is the first comprehensive survey on DeviceRSs\nthat covers a spectrum of tasks to fit various needs. We believe this survey\nwill help readers effectively grasp the current research status in this field,\nequip them with relevant technical foundations, and stimulate new research\nideas for developing DeviceRSs.\n","authors":["Hongzhi Yin","Liang Qu","Tong Chen","Wei Yuan","Ruiqi Zheng","Jing Long","Xin Xia","Yuhui Shi","Chengqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11441v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.11632v1","updated":"2024-01-21T23:56:57Z","published":"2024-01-21T23:56:57Z","title":"What Are We Optimizing For? A Human-centric Evaluation Of Deep\n Learning-based Recommender Systems","summary":" Deep learning-based (DL) models in recommender systems (RecSys) have gained\nsignificant recognition for their remarkable accuracy in predicting user\npreferences. However, their performance often lacks a comprehensive evaluation\nfrom a human-centric perspective, which encompasses various dimensions beyond\nsimple interest matching. In this work, we have developed a robust\nhuman-centric evaluation framework that incorporates seven diverse metrics to\nassess the quality of recommendations generated by five recent open-sourced DL\nmodels. Our evaluation datasets consist of both offline benchmark data and\npersonalized online recommendation feedback collected from 445 real users. We\nfind that (1) different DL models have different pros and cons in the\nmulti-dimensional metrics that we test with; (2) users generally want a\ncombination of accuracy with at least one another human values in the\nrecommendation; (3) the degree of combination of different values needs to be\ncarefully experimented to user preferred level.\n","authors":["Ruixuan Sun","Avinash Akella","Xinyi Wu","Ruoyan Kong","Joseph A. Konstan"],"pdf_url":"https://arxiv.org/pdf/2401.11632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. 
It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11630v1","updated":"2024-01-21T23:50:46Z","published":"2024-01-21T23:50:46Z","title":"Reframing Offline Reinforcement Learning as a Regression Problem","summary":" The study proposes the reformulation of offline reinforcement learning as a\nregression problem that can be solved with decision trees. Aiming to predict\nactions based on input states, return-to-go (RTG), and timestep information, we\nobserve that with gradient-boosted trees, the agent training and inference are\nvery fast, the former taking less than a minute. Despite the simplification\ninherent in this reformulated problem, our agent demonstrates performance that\nis at least on par with established methods. This assertion is validated by\ntesting it across standard datasets associated with D4RL Gym-MuJoCo tasks. We\nfurther discuss the agent's ability to generalize by testing it on two extreme\ncases, how it learns to model the return distributions effectively even with\nhighly skewed expert datasets, and how it exhibits robust performance in\nscenarios with sparse/delayed rewards.\n","authors":["Prajwal Koirala","Cody Fleming"],"pdf_url":"https://arxiv.org/pdf/2401.11630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11627v1","updated":"2024-01-21T23:41:32Z","published":"2024-01-21T23:41:32Z","title":"Tight Verification of Probabilistic Robustness in Bayesian Neural\n Networks","summary":" We introduce two algorithms for computing tight guarantees on the\nprobabilistic robustness of Bayesian Neural Networks (BNNs). Computing\nrobustness guarantees for BNNs is a significantly more challenging task than\nverifying the robustness of standard Neural Networks (NNs) because it requires\nsearching the parameters' space for safe weights. Moreover, tight and complete\napproaches for the verification of standard NNs, such as those based on\nMixed-Integer Linear Programming (MILP), cannot be directly used for the\nverification of BNNs because of the polynomial terms resulting from the\nconsecutive multiplication of variables encoding the weights. Our algorithms\nefficiently and effectively search the parameters' space for safe weights by\nusing iterative expansion and the network's gradient and can be used with any\nverification algorithm of choice for BNNs. 
In addition to proving that our\nalgorithms compute tighter bounds than the SoA, we also evaluate our algorithms\nagainst the SoA on standard benchmarks, such as MNIST and CIFAR10, showing that\nour algorithms compute bounds up to 40% tighter than the SoA.\n","authors":["Ben Batten","Mehran Hosseini","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2401.11627v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2401.11626v1","updated":"2024-01-21T23:37:33Z","published":"2024-01-21T23:37:33Z","title":"Freely Long-Thinking Transformer (FraiLT)","summary":" Freely Long-Thinking Transformer (FraiLT) is an improved transformer model\ndesigned to enhance processing capabilities without scaling up size. It\nutilizes a recursive approach, iterating over a subset of layers multiple\ntimes, and introduces iteration encodings to maintain awareness across these\ncycles. Iteration encoding allows FraiLT to achieve the interpretive depth of\nlarger models in a compact form. When evaluated on a synthetic story dataset,\nFraiLT outperformed larger models, showcasing its ability to deliver\nhigh-quality performance while reducing memory demands. This model represents a\nstep forward towards more efficient and accessible language models.\n","authors":["Akbay Tabak"],"pdf_url":"https://arxiv.org/pdf/2401.11626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11618v1","updated":"2024-01-21T22:55:26Z","published":"2024-01-21T22:55:26Z","title":"Efficient local linearity regularization to overcome catastrophic\n overfitting","summary":" Catastrophic overfitting (CO) in single-step adversarial training (AT)\nresults in abrupt drops in the adversarial test accuracy (even down to 0%). For\nmodels trained with multi-step AT, it has been observed that the loss function\nbehaves locally linearly with respect to the input, this is however lost in\nsingle-step AT. To address CO in single-step AT, several methods have been\nproposed to enforce local linearity of the loss via regularization. However,\nthese regularization terms considerably slow down training due to Double\nBackpropagation. Instead, in this work, we introduce a regularization term,\ncalled ELLE, to mitigate CO effectively and efficiently in classical AT\nevaluations, as well as some more difficult regimes, e.g., large adversarial\nperturbations and long training schedules. Our regularization term can be\ntheoretically linked to curvature of the loss function and is computationally\ncheaper than previous methods by avoiding Double Backpropagation. Our thorough\nexperimental validation demonstrates that our work does not suffer from CO,\neven in challenging settings where previous works suffer from it. We also\nnotice that adapting our regularization parameter during training (ELLE-A)\ngreatly improves the performance, specially in large $\\epsilon$ setups. Our\nimplementation is available in https://github.com/LIONS-EPFL/ELLE .\n","authors":["Elias Abad Rocamora","Fanghui Liu","Grigorios G. Chrysos","Pablo M. Olmos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2401.11618v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2310.19491v2","updated":"2024-01-21T22:35:34Z","published":"2023-10-30T12:28:53Z","title":"Generator Identification for Linear SDEs with Additive and\n Multiplicative Noise","summary":" In this paper, we present conditions for identifying the generator of a\nlinear stochastic differential equation (SDE) from the distribution of its\nsolution process with a given fixed initial state. 
These identifiability\nconditions are crucial in causal inference using linear SDEs as they enable the\nidentification of the post-intervention distributions from its observational\ndistribution. Specifically, we derive a sufficient and necessary condition for\nidentifying the generator of linear SDEs with additive noise, as well as a\nsufficient condition for identifying the generator of linear SDEs with\nmultiplicative noise. We show that the conditions derived for both types of\nSDEs are generic. Moreover, we offer geometric interpretations of the derived\nidentifiability conditions to enhance their understanding. To validate our\ntheoretical results, we perform a series of simulations, which support and\nsubstantiate the established findings.\n","authors":["Yuanyuan Wang","Xi Geng","Wei Huang","Biwei Huang","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2310.19491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11611v1","updated":"2024-01-21T22:18:29Z","published":"2024-01-21T22:18:29Z","title":"Continuous Field Reconstruction from Sparse Observations with Implicit\n Neural Networks","summary":" Reliably reconstructing physical fields from sparse sensor data is a\nchallenge that frequently arises in many scientific domains. In practice, the\nprocess generating the data often is not understood to sufficient accuracy.\nTherefore, there is a growing interest in using the deep neural network route\nto address the problem. This work presents a novel approach that learns a\ncontinuous representation of the physical field using implicit neural\nrepresentations (INRs). Specifically, after factorizing spatiotemporal\nvariability into spatial and temporal components using the separation of\nvariables technique, the method learns relevant basis functions from sparsely\nsampled irregular data points to develop a continuous representation of the\ndata. In experimental evaluations, the proposed model outperforms recent INR\nmethods, offering superior reconstruction quality on simulation data from a\nstate-of-the-art climate model and a second dataset that comprises ultra-high\nresolution satellite-based sea surface temperature fields.\n","authors":["Xihaier Luo","Wei Xu","Yihui Ren","Shinjae Yoo","Balu Nadiga"],"pdf_url":"https://arxiv.org/pdf/2401.11611v1.pdf","comment":"25 pages,21 figures"},{"id":"http://arxiv.org/abs/2401.11609v1","updated":"2024-01-21T22:11:29Z","published":"2024-01-21T22:11:29Z","title":"Graph Edits for Counterfactual Explanations: A Unified GNN Approach","summary":" Counterfactuals have been established as a popular explainability technique\nwhich leverages a set of minimal edits to alter the prediction of a classifier.\nWhen considering conceptual counterfactuals, the edits requested should\ncorrespond to salient concepts present in the input data. At the same time,\nconceptual distances are defined by knowledge graphs, ensuring the optimality\nof conceptual edits. 
In this work, we extend previous endeavors on conceptual\ncounterfactuals by introducing \\textit{graph edits as counterfactual\nexplanations}: should we represent input data as graphs, which is the shortest\ngraph edit path that results in an alternative classification label as provided\nby a black-box classifier?\n","authors":["Nikolaos Chaidos","Angeliki Dimitriou","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2401.11609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07364v2","updated":"2024-01-21T22:08:20Z","published":"2024-01-14T20:41:36Z","title":"PDE Generalization of In-Context Operator Networks: A Study on 1D Scalar\n Nonlinear Conservation Laws","summary":" Can we build a single large model for a wide range of PDE-related scientific\nlearning tasks? Can this model generalize to new PDEs, even of new forms,\nwithout any fine-tuning? In-context operator learning and the corresponding\nmodel In-Context Operator Networks (ICON) represent an initial exploration of\nthese questions. The capability of ICON regarding the first question has been\ndemonstrated previously. In this paper, we present a detailed methodology for\nsolving PDE problems with ICON, and show how a single ICON model can make\nforward and reverse predictions for different equations with different strides,\nprovided with appropriately designed data prompts. We show the positive\nevidence to the second question, i.e., ICON can generalize well to some PDEs\nwith new forms without any fine-tuning. This is exemplified through a study on\n1D scalar nonlinear conservation laws, a family of PDEs with temporal\nevolution. We also show how to broaden the range of problems that an ICON model\ncan address, by transforming functions and equations to ICON's capability\nscope. We believe that the progress in this paper is a significant step towards\nthe goal of training a foundation model for PDE-related tasks under the\nin-context operator learning framework.\n","authors":["Liu Yang","Stanley J. Osher"],"pdf_url":"https://arxiv.org/pdf/2401.07364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11608v1","updated":"2024-01-21T22:01:34Z","published":"2024-01-21T22:01:34Z","title":"$\\texttt{immrax}$: A Parallelizable and Differentiable Toolbox for\n Interval Analysis and Mixed Monotone Reachability in JAX","summary":" We present an implementation of interval analysis and mixed monotone interval\nreachability analysis as function transforms in Python, fully composable with\nthe computational framework JAX. The resulting toolbox inherits several key\nfeatures from JAX, including computational efficiency through Just-In-Time\nCompilation, GPU acceleration for quick parallelized computations, and\nAutomatic Differentiability. We demonstrate the toolbox's performance on\nseveral case studies, including a reachability problem on a vehicle model\ncontrolled by a neural network, and a robust closed-loop optimal control\nproblem for a swinging pendulum.\n","authors":["Akash Harapanahalli","Saber Jafarpour","Samuel Coogan"],"pdf_url":"https://arxiv.org/pdf/2401.11608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11605v1","updated":"2024-01-21T21:49:49Z","published":"2024-01-21T21:49:49Z","title":"Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass\n Diffusion Transformers","summary":" We present the Hourglass Diffusion Transformer (HDiT), an image generative\nmodel that exhibits linear scaling with pixel count, supporting training at\nhigh-resolution (e.g. 
$1024 \\times 1024$) directly in pixel-space. Building on\nthe Transformer architecture, which is known to scale to billions of\nparameters, it bridges the gap between the efficiency of convolutional U-Nets\nand the scalability of Transformers. HDiT trains successfully without typical\nhigh-resolution training techniques such as multiscale architectures, latent\nautoencoders or self-conditioning. We demonstrate that HDiT performs\ncompetitively with existing models on ImageNet $256^2$, and sets a new\nstate-of-the-art for diffusion models on FFHQ-$1024^2$.\n","authors":["Katherine Crowson","Stefan Andreas Baumann","Alex Birch","Tanishq Mathew Abraham","Daniel Z. Kaplan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2401.11605v1.pdf","comment":"20 pages, 13 figures, project page and code available at\n https://crowsonkb.github.io/hourglass-diffusion-transformers/"},{"id":"http://arxiv.org/abs/2312.02063v2","updated":"2024-01-21T21:41:32Z","published":"2023-12-04T17:19:37Z","title":"The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet\n Transits","summary":" This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase\nFolding and Convolutional Neural Network (CNN) system to detect exoplanets\nusing the transit method. We devise a fast folding algorithm parallelized on a\nGPU to amplify low signal-to-noise ratio transit signals, allowing a search at\nhigh precision and speed. A CNN trained on two million synthetic light curves\nreports a score indicating the likelihood of a planetary signal at each period.\nWhile the GPFC method has broad applicability across period ranges, this\nresearch specifically focuses on detecting ultra-short-period planets with\norbital periods less than one day. GPFC improves on speed by three orders of\nmagnitude over the predominant Box-fitting Least Squares (BLS) method. Our\nsimulation results show GPFC achieves $97%$ training accuracy, higher true\npositive rate at the same false positive rate of detection, and higher\nprecision at the same recall rate when compared to BLS. GPFC recovers $100\\%$\nof known ultra-short-period planets in $\\textit{Kepler}$ light curves from a\nblind search. These results highlight the promise of GPFC as an alternative\napproach to the traditional BLS algorithm for finding new transiting exoplanets\nin data taken with $\\textit{Kepler}$ and other space transit missions such as\nK2, TESS and future PLATO and Earth 2.0.\n","authors":["Kaitlyn Wang","Jian Ge","Kevin Willis","Kevin Wang","Yinan Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.02063v2.pdf","comment":"16 pages, 19 figures; Accepted for publication in the peer-reviewed\n journal, Monthly Notices of the Royal Astronomical Society (MNRAS), on\n January 20, 2024"}]},"2024-01-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.14189v3","updated":"2024-01-20T22:29:15Z","published":"2023-05-23T16:11:00Z","title":"Beyond Shared Vocabulary: Increasing Representational Word Similarities\n across Languages for Multilingual Machine Translation","summary":" Using a vocabulary that is shared across languages is common practice in\nMultilingual Neural Machine Translation (MNMT). In addition to its simple\ndesign, shared tokens play an important role in positive knowledge transfer,\nassuming that shared tokens refer to similar meanings across languages.\nHowever, when word overlap is small, especially due to different writing\nsystems, transfer is inhibited. 
In this paper, we define word-level information\ntransfer pathways via word equivalence classes and rely on graph networks to\nfuse word embeddings across languages. Our experiments demonstrate the\nadvantages of our approach: 1) embeddings of words with similar meanings are\nbetter aligned across languages, 2) our method achieves consistent BLEU\nimprovements of up to 2.3 points for high- and low-resource MNMT, and 3) less\nthan 1.0\\% additional trainable parameters are required with a limited increase\nin computational costs, while inference time remains identical to the baseline.\nWe release the codebase to the community.\n","authors":["Di Wu","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2305.14189v3.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.07510v3","updated":"2024-01-20T22:08:18Z","published":"2024-01-15T07:21:16Z","title":"Developing ChatGPT for Biology and Medicine: A Complete Review of\n Biomedical Question Answering","summary":" ChatGPT explores a strategic blueprint of question answering (QA) in\ndelivering medical diagnosis, treatment recommendations, and other healthcare\nsupport. This is achieved through the increasing incorporation of medical\ndomain data via natural language processing (NLP) and multimodal paradigms. By\ntransitioning the distribution of text, images, videos, and other modalities\nfrom the general domain to the medical domain, these techniques have expedited\nthe progress of medical domain question answering (MDQA). They bridge the gap\nbetween human natural language and sophisticated medical domain knowledge or\nexpert manual annotations, handling large-scale, diverse, unbalanced, or even\nunlabeled data analysis scenarios in medical contexts. Central to our focus is\nthe utilizing of language models and multimodal paradigms for medical question\nanswering, aiming to guide the research community in selecting appropriate\nmechanisms for their specific medical research requirements. Specialized tasks\nsuch as unimodal-related question answering, reading comprehension, reasoning,\ndiagnosis, relation extraction, probability modeling, and others, as well as\nmultimodal-related tasks like vision question answering, image caption,\ncross-modal retrieval, report summarization, and generation, are discussed in\ndetail. Each section delves into the intricate specifics of the respective\nmethod under consideration. This paper highlights the structures and\nadvancements of medical domain explorations against general domain methods,\nemphasizing their applications across different tasks and datasets. It also\noutlines current challenges and opportunities for future medical domain\nresearch, paving the way for continued innovation and application in this\nrapidly evolving field.\n","authors":["Qing Li","Lei Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2401.07510v3.pdf","comment":"50 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.02317v3","updated":"2024-01-20T21:16:09Z","published":"2023-12-04T19:58:07Z","title":"GNN2R: Weakly-Supervised Rationale-Providing Question Answering over\n Knowledge Graphs","summary":" Most current methods for multi-hop question answering (QA) over knowledge\ngraphs (KGs) only provide final conclusive answers without explanations, such\nas a set of KG entities that is difficult for normal users to review and\ncomprehend. This issue severely limits the application of KG-based QA in\nreal-world scenarios. 
However, it is non-trivial to solve due to two\nchallenges: First, annotations of reasoning chains of multi-hop questions,\nwhich could serve as supervision for explanation generation, are usually\nlacking. Second, it is difficult to maintain high efficiency when explicit KG\ntriples need to be retrieved to generate explanations. In this paper, we\npropose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to\nsolve this issue. GNN2R can provide both final answers and reasoning subgraphs\nas a rationale behind final answers efficiently with only weak supervision that\nis available through question-final answer pairs. We extensively evaluated\nGNN2R with detailed analyses in experiments. The results demonstrate that, in\nterms of effectiveness, efficiency, and quality of generated explanations,\nGNN2R outperforms existing state-of-the-art methods that are applicable to this\ntask. Our code and pre-trained models are available at\nhttps://github.com/ruijie-wang-uzh/GNN2R.\n","authors":["Ruijie Wang","Luca Rossetto","Michael Cochez","Abraham Bernstein"],"pdf_url":"https://arxiv.org/pdf/2312.02317v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11323v1","updated":"2024-01-20T20:55:21Z","published":"2024-01-20T20:55:21Z","title":"Analyzing Task-Encoding Tokens in Large Language Models","summary":" In-context learning (ICL) has become an effective solution for few-shot\nlearning in natural language processing. Past work has found that, during this\nprocess, representations of the last prompt token are utilized to store task\nreasoning procedures, thereby explaining the working mechanism of in-context\nlearning. In this paper, we seek to locate and analyze other task-encoding\ntokens whose representations store task reasoning procedures. Supported by\nexperiments that ablate the representations of different token types, we find\nthat template and stopword tokens are the most prone to be task-encoding\ntokens. In addition, we demonstrate experimentally that lexical cues,\nrepetition, and text formats are the main distinguishing characteristics of\nthese tokens. Our work provides additional insights into how large language\nmodels (LLMs) leverage task reasoning procedures in ICL and suggests that\nfuture work may involve using task-encoding tokens to improve the computational\nefficiency of LLMs at inference time and their ability to handle long\nsequences.\n","authors":["Yu Bai","Heyan Huang","Cesare Spinoso-Di Piano","Marc-Antoine Rondeau","Sanxing Chen","Yang Gao","Jackie Chi Kit Cheung"],"pdf_url":"https://arxiv.org/pdf/2401.11323v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.11316v1","updated":"2024-01-20T20:25:17Z","published":"2024-01-20T20:25:17Z","title":"PRILoRA: Pruned and Rank-Increasing Low-Rank Adaptation","summary":" With the proliferation of large pre-trained language models (PLMs),\nfine-tuning all model parameters becomes increasingly inefficient, particularly\nwhen dealing with numerous downstream tasks that entail substantial training\nand storage costs. Several approaches aimed at achieving parameter-efficient\nfine-tuning (PEFT) have been proposed. Among them, Low-Rank Adaptation (LoRA)\nstands out as an archetypal method, incorporating trainable rank decomposition\nmatrices into each target module. Nevertheless, LoRA does not consider the\nvarying importance of each layer. 
To address these challenges, we introduce\nPRILoRA, which linearly allocates a different rank for each layer, in an\nincreasing manner, and performs pruning throughout the training process,\nconsidering both the temporary magnitude of weights and the accumulated\nstatistics of the input to any given layer. We validate the effectiveness of\nPRILoRA through extensive experiments on eight GLUE benchmarks, setting a new\nstate of the art.\n","authors":["Nadav Benedek","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2401.11316v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.11305v1","updated":"2024-01-20T19:32:56Z","published":"2024-01-20T19:32:56Z","title":"Progress in Privacy Protection: A Review of Privacy Preserving\n Techniques in Recommender Systems, Edge Computing, and Cloud Computing","summary":" As digital technology evolves, the increasing use of connected devices brings\nboth challenges and opportunities in the areas of mobile crowdsourcing, edge\ncomputing, and recommender systems. This survey focuses on these dynamic\nfields, emphasizing the critical need for privacy protection in our\nincreasingly data-oriented world. It explores the latest trends in these\ninterconnected areas, with a special emphasis on privacy and data security. Our\nmethod involves an in-depth analysis of various academic works, which helps us\nto gain a comprehensive understanding of these sectors and their shifting focus\ntowards privacy concerns. We present new insights and marks a significant\nadvancement in addressing privacy issues within these technologies. The survey\nis a valuable resource for researchers, industry practitioners, and policy\nmakers, offering an extensive overview of these fields and their related\nprivacy challenges, catering to a wide audience in the modern digital era.\n","authors":["Syed Raza Bashir","Shaina Raza","Vojislav Misic"],"pdf_url":"https://arxiv.org/pdf/2401.11305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04925v3","updated":"2024-01-20T17:23:31Z","published":"2024-01-10T04:37:38Z","title":"The Impact of Reasoning Step Length on Large Language Models","summary":" Chain of Thought (CoT) is significant in improving the reasoning abilities of\nlarge language models (LLMs). However, the correlation between the\neffectiveness of CoT and the length of reasoning steps in prompts remains\nlargely unknown. To shed light on this, we have conducted several empirical\nexperiments to explore the relations. Specifically, we design experiments that\nexpand and compress the rationale reasoning steps within CoT demonstrations,\nwhile keeping all other factors constant. We have the following key findings.\nFirst, the results indicate that lengthening the reasoning steps in prompts,\neven without adding new information into the prompt, considerably enhances\nLLMs' reasoning abilities across multiple datasets. Alternatively, shortening\nthe reasoning steps, even while preserving the key information, significantly\ndiminishes the reasoning abilities of models. This finding highlights the\nimportance of the number of steps in CoT prompts and provides practical\nguidance to make better use of LLMs' potential in complex problem-solving\nscenarios. Second, we also investigated the relationship between the\nperformance of CoT and the rationales used in demonstrations. Surprisingly, the\nresult shows that even incorrect rationales can yield favorable outcomes if\nthey maintain the requisite length of inference. 
Third, we observed that the\nadvantages of increasing reasoning steps are task-dependent: simpler tasks\nrequire fewer steps, whereas complex tasks gain significantly from longer\ninference sequences.\n","authors":["Mingyu Jin","Qinkai Yu","Dong Shu","Haiyan Zhao","Wenyue Hua","Yanda Meng","Yongfeng Zhang","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2401.04925v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11268v1","updated":"2024-01-20T16:48:55Z","published":"2024-01-20T16:48:55Z","title":"Word-Level ASR Quality Estimation for Efficient Corpus Sampling and\n Post-Editing through Analyzing Attentions of a Reference-Free Metric","summary":" In the realm of automatic speech recognition (ASR), the quest for models that\nnot only perform with high accuracy but also offer transparency in their\ndecision-making processes is crucial. The potential of quality estimation (QE)\nmetrics is introduced and evaluated as a novel tool to enhance explainable\nartificial intelligence (XAI) in ASR systems. Through experiments and analyses,\nthe capabilities of the NoRefER (No Reference Error Rate) metric are explored\nin identifying word-level errors to aid post-editors in refining ASR\nhypotheses. The investigation also extends to the utility of NoRefER in the\ncorpus-building process, demonstrating its effectiveness in augmenting datasets\nwith insightful annotations. The diagnostic aspects of NoRefER are examined,\nrevealing its ability to provide valuable insights into model behaviors and\ndecision patterns. This has proven beneficial for prioritizing hypotheses in\npost-editing workflows and fine-tuning ASR models. The findings suggest that\nNoRefER is not merely a tool for error detection but also a comprehensive\nframework for enhancing ASR systems' transparency, efficiency, and\neffectiveness. To ensure the reproducibility of the results, all source codes\nof this study are made publicly available.\n","authors":["Golara Javadi","Kamer Ali Yuksel","Yunsu Kim","Thiago Castro Ferreira","Mohamed Al-Badrashiny"],"pdf_url":"https://arxiv.org/pdf/2401.11268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11248v1","updated":"2024-01-20T15:02:33Z","published":"2024-01-20T15:02:33Z","title":"Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense\n Passage Retrieval","summary":" Masked auto-encoder pre-training has emerged as a prevalent technique for\ninitializing and enhancing dense retrieval systems. It generally utilizes\nadditional Transformer decoder blocks to provide sustainable supervision\nsignals and compress contextual information into dense representations.\nHowever, the underlying reasons for the effectiveness of such a pre-training\ntechnique remain unclear. The usage of additional Transformer-based decoders\nalso incurs significant computational costs. In this study, we aim to shed\nlight on this issue by revealing that masked auto-encoder (MAE) pre-training\nwith enhanced decoding significantly improves the term coverage of input tokens\nin dense representations, compared to vanilla BERT checkpoints. Building upon\nthis observation, we propose a modification to the traditional MAE by replacing\nthe decoder of a masked auto-encoder with a completely simplified Bag-of-Word\nprediction task. 
This modification enables the efficient compression of lexical\nsignals into dense representations through unsupervised pre-training.\nRemarkably, our proposed method achieves state-of-the-art retrieval performance\non several large-scale retrieval benchmarks without requiring any additional\nparameters, which provides a 67% training speed-up compared to standard masked\nauto-encoder pre-training with enhanced decoding.\n","authors":["Guangyuan Ma","Xing Wu","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2401.11248v1.pdf","comment":"Working in progress. Our code will be available at\n https://github.com/ma787639046/bowdpr"},{"id":"http://arxiv.org/abs/2312.03122v3","updated":"2024-01-20T15:02:20Z","published":"2023-12-05T20:41:34Z","title":"Assertion Enhanced Few-Shot Learning: Instructive Technique for Large\n Language Models to Generate Educational Explanations","summary":" Human educators possess an intrinsic ability to anticipate and seek\neducational explanations from students, which drives them to pose\nthought-provoking questions when students cannot articulate these explanations\nindependently. We aim to imbue Intelligent Tutoring Systems with this ability\nusing few-shot learning capability of Large Language Models. Our work proposes\na novel prompting technique, Assertion Enhanced Few-Shot Learning, to\nfacilitate the generation of accurate, detailed oriented educational\nexplanations. Our central hypothesis is that, in educational domain, few-shot\ndemonstrations are necessary but not a sufficient condition for quality\nexplanation generation. We conducted a study involving 12 in-service teachers,\ncomparing our approach to Traditional Few-Shot Learning. The results show that\nAssertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and\nyields higher-quality explanations, as evaluated by teachers. We also conduct a\nqualitative ablation study to factor the impact of assertions to provide\neducator-friendly prompting guidelines for generating explanations in their\ndomain of interest.\n","authors":["Tasmia Shahriar","Kelly Ramos","Noboru Matsuda"],"pdf_url":"https://arxiv.org/pdf/2312.03122v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09333v2","updated":"2024-01-20T15:01:01Z","published":"2024-01-17T16:57:18Z","title":"Machines Do See Color: A Guideline to Classify Different Forms of Racist\n Discourse in Large Corpora","summary":" Current methods to identify and classify racist language in text rely on\nsmall-n qualitative approaches or large-n approaches focusing exclusively on\novert forms of racist discourse. This article provides a step-by-step\ngeneralizable guideline to identify and classify different forms of racist\ndiscourse in large corpora. In our approach, we start by conceptualizing racism\nand its different manifestations. We then contextualize these racist\nmanifestations to the time and place of interest, which allows researchers to\nidentify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a\ncross-lingual model for supervised text classification with a cutting-edge\ncontextual understanding of text. We show that XLM-R and XLM-R-Racismo, our\npretrained model, outperform other state-of-the-art approaches in classifying\nracism in large corpora. 
We illustrate our approach using a corpus of tweets\nrelating to the Ecuadorian ind\\'igena community between 2018 and 2021.\n","authors":["Diana Davila Gordillo","Joan Timoneda","Sebastian Vallejo Vera"],"pdf_url":"https://arxiv.org/pdf/2401.09333v2.pdf","comment":"37 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11246v1","updated":"2024-01-20T14:59:43Z","published":"2024-01-20T14:59:43Z","title":"Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented\n Generation in Niche Domains, Exemplified by Korean Medicine","summary":" We propose a natural language prompt-based retrieval augmented generation\n(Prompt-RAG), a novel approach to enhance the performance of generative large\nlanguage models (LLMs) in niche domains. Conventional RAG methods mostly\nrequire vector embeddings, yet the suitability of generic LLM-based embedding\nrepresentations for specialized domains remains uncertain. To explore and\nexemplify this point, we compared vector embeddings from Korean Medicine (KM)\nand Conventional Medicine (CM) documents, finding that KM document embeddings\ncorrelated more with token overlaps and less with human-assessed document\nrelatedness, in contrast to CM embeddings. Prompt-RAG, distinct from\nconventional RAG models, operates without the need for embedding vectors. Its\nperformance was assessed through a Question-Answering (QA) chatbot application,\nwhere responses were evaluated for relevance, readability, and informativeness.\nThe results showed that Prompt-RAG outperformed existing models, including\nChatGPT and conventional vector embedding-based RAGs, in terms of relevance and\ninformativeness. Despite challenges like content structuring and response\nlatency, the advancements in LLMs are expected to encourage the use of\nPrompt-RAG, making it a promising tool for other domains in need of RAG\nmethods.\n","authors":["Bongsu Kang","Jundong Kim","Tae-Rim Yun","Chang-Eop Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11246v1.pdf","comment":"26 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.16326v2","updated":"2024-01-20T14:33:54Z","published":"2023-05-10T13:40:06Z","title":"Large language models in biomedical natural language processing:\n benchmarks, baselines, and recommendations","summary":" Biomedical literature is growing rapidly, making it challenging to curate and\nextract knowledge manually. Biomedical natural language processing (BioNLP)\ntechniques that can automatically extract information from biomedical\nliterature help alleviate this burden. Recently, large Language Models (LLMs),\nsuch as GPT-3 and GPT-4, have gained significant attention for their impressive\nperformance. However, their effectiveness in BioNLP tasks and impact on method\ndevelopment and downstream users remain understudied. This pilot study (1)\nestablishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and\none-shot settings in eight BioNLP datasets across four applications: named\nentity recognition, relation extraction, multi-label document classification,\nand semantic similarity and reasoning, (2) examines the errors produced by the\nLLMs and categorized the errors into three types: missingness, inconsistencies,\nand unwanted artificial content, and (3) provides suggestions for using LLMs in\nBioNLP applications. 
We make the datasets, baselines, and results publicly\navailable to the community via\nhttps://github.com/qingyu-qc/gpt_bionlp_benchmark.\n","authors":["Qingyu Chen","Jingcheng Du","Yan Hu","Vipina Kuttichi Keloth","Xueqing Peng","Kalpana Raja","Rui Zhang","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17080v2","updated":"2024-01-20T14:08:16Z","published":"2023-12-28T15:49:43Z","title":"MR-GSM8K: A Meta-Reasoning Revolution in Large Language Model Evaluation","summary":" In this work, we introduce a novel evaluation paradigm for Large Language\nModels, one that challenges them to engage in meta-reasoning. This approach\naddresses critical shortcomings in existing math problem-solving benchmarks,\ntraditionally used to evaluate the cognitive capabilities of agents. Our\nparadigm shifts the focus from result-oriented assessments, which often\noverlook the reasoning process, to a more holistic evaluation that effectively\ndifferentiates the cognitive capabilities among models. For example, in our\nbenchmark, GPT-4 demonstrates a performance five times better than GPT3-5. The\nsignificance of this new paradigm lies in its ability to reveal potential\ncognitive deficiencies in LLMs that current benchmarks, such as GSM8K, fail to\nuncover due to their saturation and lack of effective differentiation among\nvarying reasoning abilities. Our comprehensive analysis includes several\nstate-of-the-art math models from both open-source and closed-source\ncommunities, uncovering fundamental deficiencies in their training and\nevaluation approaches. This paper not only advocates for a paradigm shift in\nthe assessment of LLMs but also contributes to the ongoing discourse on the\ntrajectory towards Artificial General Intelligence (AGI). By promoting the\nadoption of meta-reasoning evaluation methods similar to ours, we aim to\nfacilitate a more accurate assessment of the true cognitive abilities of LLMs.\n","authors":["Zhongshen Zeng","Pengguang Chen","Shu Liu","Haiyun Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17080v2.pdf","comment":"Code: https://github.com/dvlab-research/MR-GSM8K"},{"id":"http://arxiv.org/abs/2401.05949v3","updated":"2024-01-20T13:46:33Z","published":"2024-01-11T14:38:19Z","title":"Universal Vulnerabilities in Large Language Models: In-context Learning\n Backdoor Attacks","summary":" In-context learning, a paradigm bridging the gap between pre-training and\nfine-tuning, has demonstrated high efficacy in several NLP tasks, especially in\nfew-shot settings. Unlike traditional fine-tuning methods, in-context learning\nadapts pre-trained models to unseen tasks without updating any parameters.\nDespite being widely applied, in-context learning is vulnerable to malicious\nattacks. In this work, we raise security concerns regarding this paradigm. Our\nstudies demonstrate that an attacker can manipulate the behavior of large\nlanguage models by poisoning the demonstration context, without the need for\nfine-tuning the model. Specifically, we have designed a new backdoor attack\nmethod, named ICLAttack, to target large language models based on in-context\nlearning. Our method encompasses two types of attacks: poisoning demonstration\nexamples and poisoning prompts, which can make models behave in accordance with\npredefined intentions. ICLAttack does not require additional fine-tuning to\nimplant a backdoor, thus preserving the model's generality. 
Furthermore, the\npoisoned examples are correctly labeled, enhancing the natural stealth of our\nattack method. Extensive experimental results across several language models,\nranging in size from 1.3B to 40B parameters, demonstrate the effectiveness of\nour attack method, exemplified by a high average attack success rate of 95.0%\nacross the three datasets on OPT models. Our findings highlight the\nvulnerabilities of language models, and we hope this work will raise awareness\nof the possible security threats associated with in-context learning.\n","authors":["Shuai Zhao","Meihuizi Jia","Luu Anh Tuan","Jinming Wen"],"pdf_url":"https://arxiv.org/pdf/2401.05949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04620v3","updated":"2024-01-20T13:04:29Z","published":"2024-01-09T15:44:44Z","title":"Agent Alignment in Evolving Social Norms","summary":" Agents based on Large Language Models (LLMs) are increasingly permeating\nvarious domains of human production and life, highlighting the importance of\naligning them with human values. The current alignment of AI systems primarily\nfocuses on passively aligning LLMs through human intervention. However, agents\npossess characteristics like receiving environmental feedback and\nself-evolution, rendering the LLM alignment methods inadequate. In response, we\npropose an evolutionary framework for agent evolution and alignment, named\nEvolutionaryAgent, which transforms agent alignment into a process of evolution\nand selection under the principle of survival of the fittest. In an environment\nwhere social norms continuously evolve, agents better adapted to the current\nsocial norms will have a higher probability of survival and proliferation,\nwhile those inadequately aligned dwindle over time. Experimental results\nassessing the agents from multiple perspectives in aligning with social norms\ndemonstrate that EvolutionaryAgent can align progressively better with the\nevolving social norms while maintaining its proficiency in general tasks.\nEffectiveness tests conducted on various open and closed-source LLMs as the\nfoundation for agents also prove the applicability of our approach.\n","authors":["Shimin Li","Tianxiang Sun","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.04620v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.09003v2","updated":"2024-01-20T12:43:37Z","published":"2024-01-17T06:48:16Z","title":"Augmenting Math Word Problems via Iterative Question Composing","summary":" Despite recent progress in improving the mathematical reasoning ability of\nlarge language models(LLMs), solving competition-level math problems without\nthe use of external tools remains challenging for open-source LLMs. In this\nwork, we introduce the MMIQC dataset, a mixture of processed web data and\nsynthetic question-response pairs, to equip base models with better\nmathematical reasoning skills. In different model sizes, the models fine-tuned\non MMIQC consistently outperform their counterparts by a clear margin on MATH\ntest set. Notably, DeepSeek-67B-MMIQC achieves a 41.0% accuracy, 4.2% higher\nthan the previous open-source SOTA. Our experiments also show that a large part\nof the improvement can be attributed to our novel augmentation method\nIQC(Iterative Question Composing), where we iteratively ask an LLM to compose\nnew questions from the given seed problems and do rejection sampling from\nanother LLM. 
MMIQC has now been released on\nhttps://huggingface.co/datasets/Vivacem/MMIQC.\n","authors":["Haoxiong Liu","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2401.09003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11218v1","updated":"2024-01-20T12:00:40Z","published":"2024-01-20T12:00:40Z","title":"End-to-End Argument Mining over Varying Rhetorical Structures","summary":" Rhetorical Structure Theory implies no single discourse interpretation of a\ntext, and the limitations of RST parsers further exacerbate inconsistent\nparsing of similar structures. Therefore, it is important to take into account\nthat the same argumentative structure can be found in semantically similar\ntexts with varying rhetorical structures. In this work, the differences between\nparaphrases within the same argument scheme are evaluated from a rhetorical\nperspective. The study proposes a deep dependency parsing model to assess the\nconnection between rhetorical and argument structures. The model utilizes\nrhetorical relations; RST structures of paraphrases serve as training data\naugmentations. The method allows for end-to-end argumentation analysis using a\nrhetorical tree instead of a word sequence. It is evaluated on the bilingual\nMicrotexts corpus, and the first results on fully-fledged argument parsing for\nthe Russian version of the corpus are reported. The results suggest that\nargument mining can benefit from multiple variants of discourse structure.\n","authors":["Elena Chistova"],"pdf_url":"https://arxiv.org/pdf/2401.11218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11207v1","updated":"2024-01-20T10:42:15Z","published":"2024-01-20T10:42:15Z","title":"Unfair TOS: An Automated Approach using Customized BERT","summary":" Terms of Service (ToS) form an integral part of any agreement as it defines\nthe legal relationship between a service provider and an end-user. Not only do\nthey establish and delineate reciprocal rights and responsibilities, but they\nalso provide users with information on essential aspects of contracts that\npertain to the use of digital spaces. These aspects include a wide range of\ntopics, including limitation of liability, data protection, etc. Users tend to\naccept the ToS without going through it before using any application or\nservice. Such ignorance puts them in a potentially weaker situation in case any\naction is required. Existing methodologies for the detection or classification\nof unfair clauses are however obsolete and show modest performance. In this\nresearch paper, we present SOTA(State of The Art) results on unfair clause\ndetection from ToS documents based on unprecedented Fine-tuning BERT in\nintegration with SVC(Support Vector Classifier). The study shows proficient\nperformance with a macro F1-score of 0.922 at unfair clause detection, and\nsuperior performance is also shown in the classification of unfair clauses by\neach tag. Further, a comparative analysis is performed by answering research\nquestions on the Transformer models utilized. 
In order to further research and\nexperimentation the code and results are made available on\nhttps://github.com/batking24/Unfair-TOS-An-Automated-Approach-based-on-Fine-tuning-BERT-in-conjunction-with-ML.\n","authors":["Bathini Sai Akash","Akshara Kupireddy","Lalita Bhanu Murthy"],"pdf_url":"https://arxiv.org/pdf/2401.11207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11206v1","updated":"2024-01-20T10:41:03Z","published":"2024-01-20T10:41:03Z","title":"InferAligner: Inference-Time Alignment for Harmlessness through\n Cross-Model Guidance","summary":" With the rapid development of large language models (LLMs), they are not only\nused as general-purpose AI assistants but are also customized through further\nfine-tuning to meet the requirements of different applications. A pivotal\nfactor in the success of current LLMs is the alignment process. Current\nalignment methods, such as supervised fine-tuning (SFT) and reinforcement\nlearning from human feedback (RLHF), focus on training-time alignment and are\noften complex and cumbersome to implement. Therefore, we develop\n\\textbf{InferAligner}, a novel inference-time alignment method that utilizes\ncross-model guidance for harmlessness alignment. InferAligner utilizes safety\nsteering vectors extracted from safety-aligned model to modify the activations\nof the target model when responding to harmful inputs, thereby guiding the\ntarget model to provide harmless responses. Experimental results show that our\nmethod can be very effectively applied to domain-specific models in finance,\nmedicine, and mathematics, as well as to multimodal large language models\n(MLLMs) such as LLaVA. It significantly diminishes the Attack Success Rate\n(ASR) of both harmful instructions and jailbreak attacks, while maintaining\nalmost unchanged performance in downstream tasks.\n","authors":["Pengyu Wang","Dong Zhang","Linyang Li","Chenkun Tan","Xinghao Wang","Ke Ren","Botian Jiang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.11206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11185v1","updated":"2024-01-20T09:49:59Z","published":"2024-01-20T09:49:59Z","title":"How the Advent of Ubiquitous Large Language Models both Stymie and\n Turbocharge Dynamic Adversarial Question Generation","summary":" Dynamic adversarial question generation, where humans write examples to stump\na model, aims to create examples that are realistic and informative. However,\nthe advent of large language models (LLMs) has been a double-edged sword for\nhuman authors: more people are interested in seeing and pushing the limits of\nthese models, but because the models are so much stronger an opponent, they are\nharder to defeat. To understand how these models impact adversarial question\nwriting process, we enrich the writing guidance with LLMs and retrieval models\nfor the authors to reason why their questions are not adversarial. While\nauthors could create interesting, challenging adversarial questions, they\nsometimes resort to tricks that result in poor questions that are ambiguous,\nsubjective, or confusing not just to a computer but also to humans. 
To address\nthese issues, we propose new metrics and incentives for eliciting good,\nchallenging questions and present a new dataset of adversarially authored\nquestions.\n","authors":["Yoo Yeon Sung","Ishani Mondal","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2401.11185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04691v4","updated":"2024-01-20T09:36:41Z","published":"2023-10-07T05:37:41Z","title":"EMO: Earth Mover Distance Optimization for Auto-Regressive Language\n Modeling","summary":" Neural language models are probabilistic models of human text. They are\npredominantly trained using maximum likelihood estimation (MLE), which is\nequivalent to minimizing the forward cross-entropy between the empirical data\ndistribution and the model distribution. However, various degeneration\nphenomena are still widely observed when decoding from the distributions\nlearned by such models. We establish that the forward cross-entropy is\nsuboptimal as a distance metric for aligning human and model distribution due\nto its (1) recall-prioritization (2) negative diversity ignorance and (3)\ntrain-test mismatch. In this paper, we propose Earth Mover Distance\nOptimization (EMO) for auto-regressive language modeling. EMO capitalizes on\nthe inherent properties of earth mover distance to address the aforementioned\nchallenges. Due to the high complexity of direct computation, we further\nintroduce a feasible upper bound for EMO to ease end-to-end training. Upon\nextensive evaluation of language models trained using EMO and MLE. We find that\nEMO demonstrates a consistently better language modeling performance than MLE\nacross domains. Moreover, EMO demonstrates noteworthy enhancements in\ndownstream performance with minimal fine-tuning on merely 25,000 sentences.\nThis highlights the tremendous potential of EMO as a lightweight calibration\nmethod for enhancing large-scale pre-trained language models.\n","authors":["Siyu Ren","Zhiyong Wu","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.04691v4.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11143v1","updated":"2024-01-20T06:42:32Z","published":"2024-01-20T06:42:32Z","title":"Gaussian Adaptive Attention is All You Need: Robust Contextual\n Representations Across Multiple Modalities","summary":" We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a\nnovel probabilistic attention framework, and the Gaussian Adaptive Transformer\n(GAT), designed to enhance information aggregation across multiple modalities,\nincluding Speech, Text and Vision. GAAM integrates learnable mean and variance\ninto its attention mechanism, implemented in a Multi-Headed framework enabling\nit to collectively model any Probability Distribution for dynamic recalibration\nof feature significance. This method demonstrates significant improvements,\nespecially with highly non-stationary data, surpassing the state-of-the-art\nattention techniques in model performance (up to approximately +20% in\naccuracy) by identifying key elements within the feature space. GAAM's\ncompatibility with dot-product-based attention models and relatively low number\nof parameters showcases its adaptability and potential to boost existing\nattention frameworks. Empirically, GAAM exhibits superior adaptability and\nefficacy across a diverse range of tasks, including emotion recognition in\nspeech, image classification, and text classification, thereby establishing its\nrobustness and versatility in handling multi-modal data. 
Furthermore, we\nintroduce the Importance Factor (IF), a new learning-based metric that enhances\nthe explainability of models trained with GAAM-based methods. Overall, GAAM\nrepresents an advancement towards development of better performing and more\nexplainable attention models across multiple modalities.\n","authors":["Georgios Ioannides","Aman Chadha","Aaron Elkins"],"pdf_url":"https://arxiv.org/pdf/2401.11143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15407v2","updated":"2024-01-20T06:26:33Z","published":"2023-12-24T04:50:57Z","title":"A Comprehensive Analysis of the Effectiveness of Large Language Models\n as Automatic Dialogue Evaluators","summary":" Automatic evaluation is an integral aspect of dialogue system research. The\ntraditional reference-based NLG metrics are generally found to be unsuitable\nfor dialogue assessment. Consequently, recent studies have suggested various\nunique, reference-free neural metrics that better align with human evaluations.\nNotably among them, large language models (LLMs), particularly the\ninstruction-tuned variants like ChatGPT, are shown to be promising substitutes\nfor human judges. Yet, existing works on utilizing LLMs for automatic dialogue\nevaluation are limited in their scope in terms of the number of meta-evaluation\ndatasets, mode of evaluation, coverage of LLMs, etc. Hence, it remains\ninconclusive how effective these LLMs are. To this end, we conduct a\ncomprehensive study on the application of LLMs for automatic dialogue\nevaluation. Specifically, we analyze the multi-dimensional evaluation\ncapability of 30 recently emerged LLMs at both turn and dialogue levels, using\na comprehensive set of 12 meta-evaluation datasets. Additionally, we probe the\nrobustness of the LLMs in handling various adversarial perturbations at both\nturn and dialogue levels. Finally, we explore how model-level and\ndimension-level ensembles impact the evaluation performance. All resources are\navailable at https://github.com/e0397123/comp-analysis.\n","authors":["Chen Zhang","Luis Fernando D'Haro","Yiming Chen","Malu Zhang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2312.15407v2.pdf","comment":"An extended version of AAAI-2024 camera-ready paper (appendix\n included, 16 pages)"},{"id":"http://arxiv.org/abs/2401.11120v1","updated":"2024-01-20T05:10:46Z","published":"2024-01-20T05:10:46Z","title":"Enhancing Large Language Models for Clinical Decision Support by\n Incorporating Clinical Practice Guidelines","summary":" Background Large Language Models (LLMs), enhanced with Clinical Practice\nGuidelines (CPGs), can significantly improve Clinical Decision Support (CDS).\nHowever, methods for incorporating CPGs into LLMs are not well studied. Methods\nWe develop three distinct methods for incorporating CPGs into LLMs: Binary\nDecision Tree (BDT), Program-Aided Graph Construction (PAGC), and\nChain-of-Thought-Few-Shot Prompting (CoT-FSP). To evaluate the effectiveness of\nthe proposed methods, we create a set of synthetic patient descriptions and\nconduct both automatic and human evaluation of the responses generated by four\nLLMs: GPT-4, GPT-3.5 Turbo, LLaMA, and PaLM 2. Zero-Shot Prompting (ZSP) was\nused as the baseline method. We focus on CDS for COVID-19 outpatient treatment\nas the case study. Results All four LLMs exhibit improved performance when\nenhanced with CPGs compared to the baseline ZSP. BDT outperformed both CoT-FSP\nand PAGC in automatic evaluation. All of the proposed methods demonstrated high\nperformance in human evaluation. 
Conclusion LLMs enhanced with CPGs demonstrate\nsuperior performance, as compared to plain LLMs with ZSP, in providing accurate\nrecommendations for COVID-19 outpatient treatment, which also highlights the\npotential for broader applications beyond the case study.\n","authors":["David Oniani","Xizhi Wu","Shyam Visweswaran","Sumit Kapoor","Shravan Kooragayalu","Katelyn Polanska","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10873v2","updated":"2024-01-20T03:58:10Z","published":"2023-10-16T22:53:54Z","title":"IDEAL: Influence-Driven Selective Annotations Empower In-Context\n Learners in Large Language Models","summary":" In-context learning is a promising paradigm that utilizes in-context examples\nas prompts for the predictions of large language models. These prompts are\ncrucial for achieving strong performance. However, since the prompts need to be\nsampled from a large volume of annotated examples, finding the right prompt may\nresult in high annotation costs. To address this challenge, this paper\nintroduces an influence-driven selective annotation method that aims to\nminimize annotation costs while improving the quality of in-context examples.\nThe essence of our method is to select a pivotal subset from a large-scale\nunlabeled data pool to annotate for the subsequent sampling of prompts.\nSpecifically, a directed graph is first constructed to represent unlabeled\ndata. Afterward, the influence of candidate unlabeled subsets is quantified\nwith a diffusion process. A simple yet effective greedy algorithm for unlabeled\ndata selection is lastly introduced. It iteratively selects the data if it\nprovides a maximum marginal gain with respect to quantified influence. Compared\nwith previous efforts on selective annotations, our influence-driven method\nworks in an end-to-end manner, avoids an intractable explicit balance between\ndata diversity and representativeness, and enjoys theoretical support.\nExperiments confirm the superiority of the proposed method on various\nbenchmarks, achieving better performance under lower time consumption during\nsubset selection. The project page is available at\nhttps://skzhang1.github.io/IDEAL/.\n","authors":["Shaokun Zhang","Xiaobo Xia","Zhaoqing Wang","Ling-Hao Chen","Jiale Liu","Qingyun Wu","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.10873v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11107v1","updated":"2024-01-20T03:55:17Z","published":"2024-01-20T03:55:17Z","title":"Exploiting Duality in Open Information Extraction with Predicate Prompt","summary":" Open information extraction (OpenIE) aims to extract the schema-free triplets\nin the form of (\\emph{subject}, \\emph{predicate}, \\emph{object}) from a given\nsentence. Compared with general information extraction (IE), OpenIE poses more\nchallenges for the IE models, {especially when multiple complicated triplets\nexist in a sentence. 
To extract these complicated triplets more effectively, in\nthis paper we propose a novel generative OpenIE model, namely \\emph{DualOIE},\nwhich achieves a dual task at the same time as extracting some triplets from\nthe sentence, i.e., converting the triplets into the sentence.} Such dual task\nencourages the model to correctly recognize the structure of the given sentence\nand thus is helpful to extract all potential triplets from the sentence.\nSpecifically, DualOIE extracts the triplets in two steps: 1) first extracting a\nsequence of all potential predicates, 2) then using the predicate sequence as a\nprompt to induce the generation of triplets. Our experiments on two benchmarks\nand our dataset constructed from Meituan demonstrate that DualOIE achieves the\nbest performance among the state-of-the-art baselines. Furthermore, the online\nA/B test on Meituan platform shows that 0.93\\% improvement of QV-CTR and 0.56\\%\nimprovement of UV-CTR have been obtained when the triplets extracted by DualOIE\nwere leveraged in Meituan's search system.\n","authors":["Zhen Chen","Jingping Liu","Deqing Yang","Yanghua Xiao","Huimin Xu","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2401.11107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08577v3","updated":"2024-01-20T02:36:12Z","published":"2023-02-16T20:46:36Z","title":"For Generated Text, Is NLI-Neutral Text the Best Text?","summary":" We explore incorporating natural language inference (NLI) into the text\ngenerative pipeline by using a pre-trained NLI model to assess whether a\ngenerated sentence entails, contradicts, or is neutral to the prompt and\npreceding text. First, we show that the NLI task is predictive of generation\nerrors made by GPT-3. We use these results to develop an NLI-informed\ngeneration procedure for GPT-J. Then, we evaluate these generations by\nobtaining human annotations on error types and overall quality. We find that an\nNLI strategy of maximizing entailment improves text generation when the nucleus\nsampling randomness parameter value is high, while one which maximizes\ncontradiction is in fact productive when the parameter value is low. Overall,\nthough, we demonstrate that an NLI strategy of maximizing the neutral class\nprovides the highest quality of generated text (significantly better than the\nvanilla generations), regardless of parameter value.\n","authors":["Michail Mersinias","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2302.08577v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.11248v1","updated":"2024-01-20T15:02:33Z","published":"2024-01-20T15:02:33Z","title":"Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense\n Passage Retrieval","summary":" Masked auto-encoder pre-training has emerged as a prevalent technique for\ninitializing and enhancing dense retrieval systems. It generally utilizes\nadditional Transformer decoder blocks to provide sustainable supervision\nsignals and compress contextual information into dense representations.\nHowever, the underlying reasons for the effectiveness of such a pre-training\ntechnique remain unclear. The usage of additional Transformer-based decoders\nalso incurs significant computational costs. In this study, we aim to shed\nlight on this issue by revealing that masked auto-encoder (MAE) pre-training\nwith enhanced decoding significantly improves the term coverage of input tokens\nin dense representations, compared to vanilla BERT checkpoints. 
Building upon\nthis observation, we propose a modification to the traditional MAE by replacing\nthe decoder of a masked auto-encoder with a completely simplified Bag-of-Word\nprediction task. This modification enables the efficient compression of lexical\nsignals into dense representations through unsupervised pre-training.\nRemarkably, our proposed method achieves state-of-the-art retrieval performance\non several large-scale retrieval benchmarks without requiring any additional\nparameters, which provides a 67% training speed-up compared to standard masked\nauto-encoder pre-training with enhanced decoding.\n","authors":["Guangyuan Ma","Xing Wu","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2401.11248v1.pdf","comment":"Working in progress. Our code will be available at\n https://github.com/ma787639046/bowdpr"},{"id":"http://arxiv.org/abs/2401.11246v1","updated":"2024-01-20T14:59:43Z","published":"2024-01-20T14:59:43Z","title":"Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented\n Generation in Niche Domains, Exemplified by Korean Medicine","summary":" We propose a natural language prompt-based retrieval augmented generation\n(Prompt-RAG), a novel approach to enhance the performance of generative large\nlanguage models (LLMs) in niche domains. Conventional RAG methods mostly\nrequire vector embeddings, yet the suitability of generic LLM-based embedding\nrepresentations for specialized domains remains uncertain. To explore and\nexemplify this point, we compared vector embeddings from Korean Medicine (KM)\nand Conventional Medicine (CM) documents, finding that KM document embeddings\ncorrelated more with token overlaps and less with human-assessed document\nrelatedness, in contrast to CM embeddings. Prompt-RAG, distinct from\nconventional RAG models, operates without the need for embedding vectors. Its\nperformance was assessed through a Question-Answering (QA) chatbot application,\nwhere responses were evaluated for relevance, readability, and informativeness.\nThe results showed that Prompt-RAG outperformed existing models, including\nChatGPT and conventional vector embedding-based RAGs, in terms of relevance and\ninformativeness. Despite challenges like content structuring and response\nlatency, the advancements in LLMs are expected to encourage the use of\nPrompt-RAG, making it a promising tool for other domains in need of RAG\nmethods.\n","authors":["Bongsu Kang","Jundong Kim","Tae-Rim Yun","Chang-Eop Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11246v1.pdf","comment":"26 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.16326v2","updated":"2024-01-20T14:33:54Z","published":"2023-05-10T13:40:06Z","title":"Large language models in biomedical natural language processing:\n benchmarks, baselines, and recommendations","summary":" Biomedical literature is growing rapidly, making it challenging to curate and\nextract knowledge manually. Biomedical natural language processing (BioNLP)\ntechniques that can automatically extract information from biomedical\nliterature help alleviate this burden. Recently, large Language Models (LLMs),\nsuch as GPT-3 and GPT-4, have gained significant attention for their impressive\nperformance. However, their effectiveness in BioNLP tasks and impact on method\ndevelopment and downstream users remain understudied. 
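The bag-of-words pre-training objective described above can be illustrated in a few lines of numpy: the dense passage vector is asked to predict which vocabulary ids occurred in the input through a single linear projection and a multi-label cross-entropy, in place of a Transformer decoder. This is a hedged sketch of the idea, not the authors' code (which they indicate will appear at the linked repository).

import numpy as np

def bow_loss(dense_vec, input_ids, W, vocab_size):
    """dense_vec: (d,) passage embedding; input_ids: token ids of the passage;
    W: (vocab_size, d) projection; returns a scalar multi-label BCE loss."""
    logits = W @ dense_vec                          # (vocab_size,)
    probs = 1.0 / (1.0 + np.exp(-logits))           # sigmoid per vocabulary entry
    targets = np.zeros(vocab_size)
    targets[np.unique(input_ids)] = 1.0             # bag of words: 1 if the token occurs
    eps = 1e-9
    return -np.mean(targets * np.log(probs + eps) +
                    (1 - targets) * np.log(1 - probs + eps))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    d, vocab = 16, 1000
    h = rng.normal(size=d)                # stand-in for the encoder's dense output
    W = rng.normal(size=(vocab, d)) * 0.01
    ids = np.array([5, 17, 17, 256, 999])
    print("BoW loss:", bow_loss(h, ids, W, vocab))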
This pilot study (1)\nestablishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and\none-shot settings in eight BioNLP datasets across four applications: named\nentity recognition, relation extraction, multi-label document classification,\nand semantic similarity and reasoning, (2) examines the errors produced by the\nLLMs and categorized the errors into three types: missingness, inconsistencies,\nand unwanted artificial content, and (3) provides suggestions for using LLMs in\nBioNLP applications. We make the datasets, baselines, and results publicly\navailable to the community via\nhttps://github.com/qingyu-qc/gpt_bionlp_benchmark.\n","authors":["Qingyu Chen","Jingcheng Du","Yan Hu","Vipina Kuttichi Keloth","Xueqing Peng","Kalpana Raja","Rui Zhang","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11201v1","updated":"2024-01-20T10:28:25Z","published":"2024-01-20T10:28:25Z","title":"Navigating the Thin Line: Examining User Behavior in Search to Detect\n Engagement and Backfire Effects","summary":" Opinionated users often seek information that aligns with their preexisting\nbeliefs while dismissing contradictory evidence due to confirmation bias. This\nconduct hinders their ability to consider alternative stances when searching\nthe web. Despite this, few studies have analyzed how the diversification of\nsearch results on disputed topics influences the search behavior of highly\nopinionated users. To this end, we present a preregistered user study (n = 257)\ninvestigating whether different levels (low and high) of bias metrics and\nsearch results presentation (with or without AI-predicted stances labels) can\naffect the stance diversity consumption and search behavior of opinionated\nusers on three debated topics (i.e., atheism, intellectual property rights, and\nschool uniforms). Our results show that exposing participants to\n(counter-attitudinally) biased search results increases their consumption of\nattitude-opposing content, but we also found that bias was associated with a\ntrend toward overall fewer interactions within the search page. We also found\nthat 19% of users interacted with queries and search pages but did not select\nany search results. When we removed these participants in a post-hoc analysis,\nwe found that stance labels increased the diversity of stances consumed by\nusers, particularly when the search results were biased. Our findings highlight\nthe need for future research to explore distinct search scenario settings to\ngain insight into opinionated users' behavior.\n","authors":["F. M. Cau","N. Tintarev"],"pdf_url":"https://arxiv.org/pdf/2401.11201v1.pdf","comment":"17 pages, 3 figures, ECIR2024 (46th European Conference on\n Information Retrieval - IR4Good track)"},{"id":"http://arxiv.org/abs/2401.11198v1","updated":"2024-01-20T10:25:58Z","published":"2024-01-20T10:25:58Z","title":"A Deep Learning Approach for Selective Relevance Feedback","summary":" Pseudo-relevance feedback (PRF) can enhance average retrieval effectiveness\nover a sufficiently large number of queries. However, PRF often introduces a\ndrift into the original information need, thus hurting the retrieval\neffectiveness of several queries. While a selective application of PRF can\npotentially alleviate this issue, previous approaches have largely relied on\nunsupervised or feature-based learning to determine whether a query should be\nexpanded. 
In contrast, we revisit the problem of selective PRF from a deep\nlearning perspective, presenting a model that is entirely data-driven and\ntrained in an end-to-end manner. The proposed model leverages a\ntransformer-based bi-encoder architecture. Additionally, to further improve\nretrieval effectiveness with this selective PRF approach, we make use of the\nmodel's confidence estimates to combine the information from the original and\nexpanded queries. In our experiments, we apply this selective feedback on a\nnumber of different combinations of ranking and feedback models, and show that\nour proposed approach consistently improves retrieval effectiveness for both\nsparse and dense ranking models, with the feedback models being either sparse,\ndense or generative.\n","authors":["Suchana Datta","Debasis Ganguly","Sean MacAvaney","Derek Greene"],"pdf_url":"https://arxiv.org/pdf/2401.11198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07042v2","updated":"2024-01-20T08:58:56Z","published":"2023-04-14T10:33:56Z","title":"Learning Graph ODE for Continuous-Time Sequential Recommendation","summary":" Sequential recommendation aims at understanding user preference by capturing\nsuccessive behavior correlations, which are usually represented as the item\npurchasing sequences based on their past interactions. Existing efforts\ngenerally predict the next item via modeling the sequential patterns. Despite\neffectiveness, there exist two natural deficiencies: (i) user preference is\ndynamic in nature, and the evolution of collaborative signals is often ignored;\nand (ii) the observed interactions are often irregularly-sampled, while\nexisting methods model item transitions assuming uniform intervals. Thus, how\nto effectively model and predict the underlying dynamics for user preference\nbecomes a critical research problem. To tackle the above challenges, in this\npaper, we focus on continuous-time sequential recommendation and propose a\nprincipled graph ordinary differential equation framework named GDERec.\nTechnically, GDERec is characterized by an autoregressive graph ordinary\ndifferential equation consisting of two components, which are parameterized by\ntwo tailored graph neural networks (GNNs) respectively to capture user\npreference from the perspective of hybrid dynamical systems. The two customized\nGNNs are trained alternately in an autoregressive manner to track the evolution\nof the underlying system from irregular observations, and thus learn effective\nrepresentations of users and items beneficial to the sequential recommendation.\nExtensive experiments on five benchmark datasets demonstrate the superiority of\nour model over various state-of-the-art recommendation methods.\n","authors":["Yifang Qin","Wei Ju","Hongjun Wu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07042v2.pdf","comment":"Accepted by EEE Transactions on Knowledge and Data Engineering (TKDE\n 2024)"},{"id":"http://arxiv.org/abs/2401.11145v1","updated":"2024-01-20T06:52:14Z","published":"2024-01-20T06:52:14Z","title":"Document Set Expansion with Positive-Unlabeled Learning: A Density\n Estimation-based Approach","summary":" Document set expansion aims to identify relevant documents from a large\ncollection based on a small set of documents that are on a fine-grained topic.\nPrevious work shows that PU learning is a promising method for this task.\nHowever, some serious issues remain unresolved, i.e. 
typical challenges that PU\nmethods suffer from, such as unknown class prior and imbalanced data, and the need\nfor transductive experimental settings. In this paper, we propose a novel PU\nlearning framework based on density estimation, called puDE, that can handle\nthe above issues. The advantage of puDE is that it is neither constrained to the\nSCAR assumption nor requires any class prior knowledge. We demonstrate the\neffectiveness of the proposed method using a series of real-world datasets and\nconclude that our method is a better alternative for the DSE task.\n","authors":["Haiyang Zhang","Qiuyi Chen","Yuanjie Zou","Yushan Pan","Jia Wang","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2401.11145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10049v2","updated":"2024-01-20T05:12:52Z","published":"2023-12-02T06:36:14Z","title":"Knowledge Graph Reasoning Based on Attention GCN","summary":" We propose a novel technique to enhance Knowledge Graph Reasoning by\ncombining Graph Convolution Neural Network (GCN) with the Attention Mechanism.\nThis approach utilizes the Attention Mechanism to examine the relationships\nbetween entities and their neighboring nodes, which helps to develop detailed\nfeature vectors for each entity. The GCN uses shared parameters to effectively\nrepresent the characteristics of adjacent entities. We first learn the\nsimilarity of entities for node representation learning. By integrating the\nattributes of the entities and their interactions, this method generates\nextensive implicit feature vectors for each entity, improving performance in\ntasks including entity classification and link prediction, outperforming\ntraditional neural network models. To conclude, this work provides crucial\nmethodological support for a range of applications, such as search engines,\nquestion-answering systems, recommendation systems, and data integration tasks.\n","authors":["Meera Gupta","Ravi Khanna","Divya Choudhary","Nandini Rao"],"pdf_url":"https://arxiv.org/pdf/2312.10049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11107v1","updated":"2024-01-20T03:55:17Z","published":"2024-01-20T03:55:17Z","title":"Exploiting Duality in Open Information Extraction with Predicate Prompt","summary":" Open information extraction (OpenIE) aims to extract the schema-free triplets\nin the form of (\\emph{subject}, \\emph{predicate}, \\emph{object}) from a given\nsentence. Compared with general information extraction (IE), OpenIE poses more\nchallenges for the IE models, {especially when multiple complicated triplets\nexist in a sentence. To extract these complicated triplets more effectively, in\nthis paper we propose a novel generative OpenIE model, namely \\emph{DualOIE},\nwhich achieves a dual task at the same time as extracting some triplets from\nthe sentence, i.e., converting the triplets into the sentence.} Such dual task\nencourages the model to correctly recognize the structure of the given sentence\nand thus is helpful to extract all potential triplets from the sentence.\nSpecifically, DualOIE extracts the triplets in two steps: 1) first extracting a\nsequence of all potential predicates, 2) then using the predicate sequence as a\nprompt to induce the generation of triplets. Our experiments on two benchmarks\nand our dataset constructed from Meituan demonstrate that DualOIE achieves the\nbest performance among the state-of-the-art baselines. 
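A hedged sketch of the density-estimation view of PU learning used by puDE as described above: fit one density on the small positive (seed) set and one on the whole pool, then rank unlabeled documents by the log density ratio. The isotropic Gaussian KDE, bandwidth, and toy embeddings are illustrative assumptions rather than the paper's estimator.

import numpy as np

def gaussian_kde(train, query, bandwidth=0.5):
    # Simple isotropic-Gaussian kernel density estimate; returns log p(query).
    d = train.shape[1]
    diff = query[:, None, :] - train[None, :, :]           # (n_query, n_train, d)
    sq = (diff ** 2).sum(-1) / (2 * bandwidth ** 2)
    log_kernel = -sq - 0.5 * d * np.log(2 * np.pi * bandwidth ** 2)
    return np.logaddexp.reduce(log_kernel, axis=1) - np.log(train.shape[0])

def pu_scores(pos_emb, pool_emb):
    """Higher score = more likely to belong to the fine-grained positive topic."""
    log_p_pos = gaussian_kde(pos_emb, pool_emb)
    log_p_all = gaussian_kde(pool_emb, pool_emb)
    return log_p_pos - log_p_all                            # log density ratio

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    positives = rng.normal(loc=2.0, size=(20, 8))            # labeled seed documents
    pool = np.vstack([rng.normal(loc=2.0, size=(30, 8)),     # hidden positives
                      rng.normal(loc=-2.0, size=(100, 8))])  # off-topic documents
    scores = pu_scores(positives, pool)
    print("top-5 candidates:", np.argsort(-scores)[:5])      # indices < 30 expected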
Furthermore, the online\nA/B test on Meituan platform shows that 0.93\\% improvement of QV-CTR and 0.56\\%\nimprovement of UV-CTR have been obtained when the triplets extracted by DualOIE\nwere leveraged in Meituan's search system.\n","authors":["Zhen Chen","Jingping Liu","Deqing Yang","Yanghua Xiao","Huimin Xu","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2401.11107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11089v1","updated":"2024-01-20T02:38:21Z","published":"2024-01-20T02:38:21Z","title":"FedRKG: A Privacy-preserving Federated Recommendation Framework via\n Knowledge Graph Enhancement","summary":" Federated Learning (FL) has emerged as a promising approach for preserving\ndata privacy in recommendation systems by training models locally. Recently,\nGraph Neural Networks (GNN) have gained popularity in recommendation tasks due\nto their ability to capture high-order interactions between users and items.\nHowever, privacy concerns prevent the global sharing of the entire user-item\ngraph. To address this limitation, some methods create pseudo-interacted items\nor users in the graph to compensate for missing information for each client.\nUnfortunately, these methods introduce random noise and raise privacy concerns.\nIn this paper, we propose FedRKG, a novel federated recommendation system,\nwhere a global knowledge graph (KG) is constructed and maintained on the server\nusing publicly available item information, enabling higher-order user-item\ninteractions. On the client side, a relation-aware GNN model leverages diverse\nKG relationships. To protect local interaction items and obscure gradients, we\nemploy pseudo-labeling and Local Differential Privacy (LDP). Extensive\nexperiments conducted on three real-world datasets demonstrate the competitive\nperformance of our approach compared to centralized algorithms while ensuring\nprivacy preservation. Moreover, FedRKG achieves an average accuracy improvement\nof 4% compared to existing federated learning baselines.\n","authors":["Dezhong Yao","Tongtong Liu","Qi Cao","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2401.11089v1.pdf","comment":null}]},"2024-01-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.12975v1","updated":"2024-01-23T18:59:43Z","published":"2024-01-23T18:59:43Z","title":"HAZARD Challenge: Embodied Decision Making in Dynamically Changing\n Environments","summary":" Recent advances in high-fidelity virtual environments serve as one of the\nmajor driving forces for building intelligent embodied agents to perceive,\nreason and interact with the physical world. Typically, these environments\nremain unchanged unless agents interact with them. However, in real-world\nscenarios, agents might also face dynamically changing environments\ncharacterized by unexpected events and need to rapidly take action accordingly.\nTo remedy this gap, we propose a new simulated embodied benchmark, called\nHAZARD, specifically designed to assess the decision-making abilities of\nembodied agents in dynamic situations. HAZARD consists of three unexpected\ndisaster scenarios, including fire, flood, and wind, and specifically supports\nthe utilization of large language models (LLMs) to assist common sense\nreasoning and decision-making. This benchmark enables us to evaluate autonomous\nagents' decision-making capabilities across various pipelines, including\nreinforcement learning (RL), rule-based, and search-based methods in\ndynamically changing environments. 
As a first step toward addressing this\nchallenge using large language models, we further develop an LLM-based agent\nand perform an in-depth analysis of its promise and challenge of solving these\nchallenging tasks. HAZARD is available at https://vis-www.cs.umass.edu/hazard/.\n","authors":["Qinhong Zhou","Sunli Chen","Yisong Wang","Haozhe Xu","Weihua Du","Hongxin Zhang","Yilun Du","Joshua B. Tenenbaum","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2401.12975v1.pdf","comment":"ICLR 2024. The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2401.12973v1","updated":"2024-01-23T18:59:21Z","published":"2024-01-23T18:59:21Z","title":"In-Context Language Learning: Arhitectures and Algorithms","summary":" Large-scale neural language models exhibit a remarkable capacity for\nin-context learning (ICL): they can infer novel functions from datasets\nprovided as input. Most of our current understanding of when and how ICL arises\ncomes from LMs trained on extremely simple learning problems like linear\nregression and associative recall. There remains a significant gap between\nthese model problems and the \"real\" ICL exhibited by LMs trained on large text\ncorpora, which involves not just retrieval and function approximation but\nfree-form generation of language and other structured outputs. In this paper,\nwe study ICL through the lens of a new family of model problems we term in\ncontext language learning (ICLL). In ICLL, LMs are presented with a set of\nstrings from a formal language, and must generate additional strings from the\nsame language. We focus on in-context learning of regular languages generated\nby random finite automata. We evaluate a diverse set of neural sequence models\n(including several RNNs, Transformers, and state-space model variants) on\nregular ICLL tasks, aiming to answer three questions: (1) Which model classes\nare empirically capable of ICLL? (2) What algorithmic solutions do successful\nmodels implement to perform ICLL? (3) What architectural changes can improve\nICLL in less performant models? We first show that Transformers significantly\noutperform neural sequence models with recurrent or convolutional\nrepresentations on ICLL tasks. Next, we provide evidence that their ability to\ndo so relies on specialized \"n-gram heads\" (higher-order variants of induction\nheads) that compute input-conditional next-token distributions. Finally, we\nshow that hard-wiring these heads into recurrent and convolutional models\nimproves performance not just on ICLL, but natural language modeling --\nimproving the perplexity of 340M-parameter models by up to 1.14 points (6.7%)\non the SlimPajama dataset.\n","authors":["Ekin Akyürek","Bailin Wang","Yoon Kim","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2401.12973v1.pdf","comment":"29 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12970v1","updated":"2024-01-23T18:57:53Z","published":"2024-01-23T18:57:53Z","title":"Raidar: geneRative AI Detection viA Rewriting","summary":" We find that large language models (LLMs) are more likely to modify\nhuman-written text than AI-generated text when tasked with rewriting. This\ntendency arises because LLMs often perceive AI-generated text as high-quality,\nleading to fewer modifications. We introduce a method to detect AI-generated\ncontent by prompting LLMs to rewrite text and calculating the editing distance\nof the output. We dubbed our geneRative AI Detection viA Rewriting method\nRaidar. 
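The in-context language learning setup described above can be made concrete with a small sketch that samples a random DFA and draws strings it accepts, the kind of example set an ICLL prompt would present to the model before asking it to continue the language. The automaton size, alphabet, and sampling procedure are illustrative choices, not the benchmark's exact generator.

import random

def random_dfa(n_states=4, alphabet="ab", seed=0):
    rng = random.Random(seed)
    delta = {(s, ch): rng.randrange(n_states)
             for s in range(n_states) for ch in alphabet}
    accept = {s for s in range(n_states) if rng.random() < 0.5} or {0}
    return delta, accept

def accepts(delta, accept, string, start=0):
    state = start
    for ch in string:
        state = delta[(state, ch)]
    return state in accept

def sample_accepted(delta, accept, alphabet="ab", max_len=8, n=5, seed=1):
    rng = random.Random(seed)
    out = []
    for _ in range(10000):              # cap attempts so the loop always terminates
        if len(out) == n:
            break
        s = "".join(rng.choice(alphabet) for _ in range(rng.randint(1, max_len)))
        if accepts(delta, accept, s):
            out.append(s)
    return out

if __name__ == "__main__":
    delta, accept = random_dfa()
    examples = sample_accepted(delta, accept)
    # An ICLL prompt would present `examples` and ask the model to generate
    # further strings from the same regular language.
    print("accepting states:", accept, "examples:", examples)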
Raidar significantly improves the F1 detection scores of existing AI\ncontent detection models -- both academic and commercial -- across various\ndomains, including News, creative writing, student essays, code, Yelp reviews,\nand arXiv papers, with gains of up to 29 points. Operating solely on word\nsymbols without high-dimensional features, our method is compatible with black\nbox LLMs, and is inherently robust on new content. Our results illustrate the\nunique imprint of machine-generated text through the lens of the machines\nthemselves.\n","authors":["Chengzhi Mao","Carl Vondrick","Hao Wang","Junfeng Yang"],"pdf_url":"https://arxiv.org/pdf/2401.12970v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12963v1","updated":"2024-01-23T18:45:54Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks. However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v1.pdf","comment":"26 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.08535v2","updated":"2024-01-23T18:35:40Z","published":"2023-10-12T17:24:15Z","title":"Formally Specifying the High-Level Behavior of LLM-Based Agents","summary":" Autonomous, goal-driven agents powered by LLMs have recently emerged as\npromising tools for solving challenging problems without the need for\ntask-specific finetuned models that can be expensive to procure. Currently, the\ndesign and implementation of such agents is ad hoc, as the wide variety of\ntasks that LLM-based agents may be applied to naturally means there can be no\none-size-fits-all approach to agent design. 
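A minimal sketch of the Raidar-style detection signal summarized above: prompt an LLM to rewrite the text, measure how much it changed, and flag low-change inputs as likely machine-generated. The rewrite() callable is a placeholder for a real model call, and the threshold is an illustrative value, not one reported in the paper.

import difflib
from typing import Callable

PREFIX = "Rewrite this text: "

def rewrite_change_ratio(text: str, rewrite: Callable[[str], str]) -> float:
    rewritten = rewrite(PREFIX + text)
    similarity = difflib.SequenceMatcher(None, text, rewritten).ratio()
    return 1.0 - similarity            # fraction of the text that was changed

def looks_ai_generated(text: str, rewrite: Callable[[str], str],
                       threshold: float = 0.15) -> bool:
    # Few edits (low change ratio) -> the LLM "accepts" the text -> flag as AI.
    return rewrite_change_ratio(text, rewrite) < threshold

if __name__ == "__main__":
    def toy_rewrite(prompt: str) -> str:
        # Stand-in model: lightly touches fluent text, heavily edits clunky text.
        body = prompt[len(PREFIX):]
        return body if "utilize" in body else "A thoroughly rephrased version of the input."

    print(looks_ai_generated("We utilize a transformer to utilize context.", toy_rewrite))
    print(looks_ai_generated("cat dog pizza random words here", toy_rewrite))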
In this work we aim to alleviate\nthe difficulty of designing and implementing new agents by proposing a\nminimalistic generation framework that simplifies the process of building\nagents. The framework we introduce allows the user to define desired agent\nbehaviors in a high-level, declarative specification that is then used to\nconstruct a decoding monitor which guarantees the LLM will produce an output\nexhibiting the desired behavior. Our declarative approach, in which the\nbehavior is described without concern for how it should be implemented or\nenforced, enables rapid design, implementation, and experimentation with\ndifferent LLM-based agents. We demonstrate how the proposed framework can be\nused to implement recent LLM-based agents (e.g., ReACT), and show how the\nflexibility of our approach can be leveraged to define a new agent with more\ncomplex behavior, the Plan-Act-Summarize-Solve (PASS) agent. Lastly, we\ndemonstrate that our method outperforms other agents on multiple popular\nreasoning-centric question-answering benchmarks.\n","authors":["Maxwell Crouse","Ibrahim Abdelaziz","Ramon Astudillo","Kinjal Basu","Soham Dan","Sadhana Kumaravel","Achille Fokoue","Pavan Kapanipathi","Salim Roukos","Luis Lastras"],"pdf_url":"https://arxiv.org/pdf/2310.08535v2.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2401.12954v1","updated":"2024-01-23T18:22:19Z","published":"2024-01-23T18:22:19Z","title":"Meta-Prompting: Enhancing Language Models with Task-Agnostic Scaffolding","summary":" We introduce meta-prompting, an effective scaffolding technique designed to\nenhance the functionality of language models (LMs). This approach transforms a\nsingle LM into a multi-faceted conductor, adept at managing and integrating\nmultiple independent LM queries. By employing high-level instructions,\nmeta-prompting guides the LM to break down complex tasks into smaller, more\nmanageable subtasks. These subtasks are then handled by distinct \"expert\"\ninstances of the same LM, each operating under specific, tailored instructions.\nCentral to this process is the LM itself, in its role as the conductor, which\nensures seamless communication and effective integration of the outputs from\nthese expert models. It additionally employs its inherent critical thinking and\nrobust verification processes to refine and authenticate the end result. This\ncollaborative prompting approach empowers a single LM to simultaneously act as\na comprehensive orchestrator and a panel of diverse experts, significantly\nenhancing its performance across a wide array of tasks. The zero-shot,\ntask-agnostic nature of meta-prompting greatly simplifies user interaction by\nobviating the need for detailed, task-specific instructions. Furthermore, our\nresearch demonstrates the seamless integration of external tools, such as a\nPython interpreter, into the meta-prompting framework, thereby broadening its\napplicability and utility. 
Through rigorous experimentation with GPT-4, we\nestablish the superiority of meta-prompting over conventional scaffolding\nmethods: When averaged across all tasks, including the Game of 24,\nCheckmate-in-One, and Python Programming Puzzles, meta-prompting, augmented\nwith a Python interpreter functionality, surpasses standard prompting by 17.1%,\nexpert (dynamic) prompting by 17.3%, and multipersona prompting by 15.2%.\n","authors":["Mirac Suzgun","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2401.12954v1.pdf","comment":"https://github.com/suzgunmirac/meta-prompting"},{"id":"http://arxiv.org/abs/2310.17715v2","updated":"2024-01-23T18:19:18Z","published":"2023-10-26T18:22:13Z","title":"Outlier Dimensions Encode Task-Specific Knowledge","summary":" Representations from large language models (LLMs) are known to be dominated\nby a small subset of dimensions with exceedingly high variance. Previous works\nhave argued that although ablating these outlier dimensions in LLM\nrepresentations hurts downstream performance, outlier dimensions are\ndetrimental to the representational quality of embeddings. In this study, we\ninvestigate how fine-tuning impacts outlier dimensions and show that 1) outlier\ndimensions that occur in pre-training persist in fine-tuned models and 2) a\nsingle outlier dimension can complete downstream tasks with a minimal error\nrate. Our results suggest that outlier dimensions can encode crucial\ntask-specific knowledge and that the value of a representation in a single\noutlier dimension drives downstream model decisions.\n","authors":["William Rudman","Catherine Chen","Carsten Eickhoff"],"pdf_url":"https://arxiv.org/pdf/2310.17715v2.pdf","comment":"Camera-ready version for EMNLP 2023"},{"id":"http://arxiv.org/abs/2401.12947v1","updated":"2024-01-23T18:07:38Z","published":"2024-01-23T18:07:38Z","title":"Transformer-Based Models Are Not Yet Perfect At Learning to Emulate\n Structural Recursion","summary":" This paper investigates the ability of transformer-based models to learn\nstructural recursion from examples. Recursion is a universal concept in both\nnatural and formal languages. Structural recursion is central to the\nprogramming language and formal mathematics tasks where symbolic tools\ncurrently excel beyond neural models, such as inferring semantic relations\nbetween datatypes and emulating program behavior. We introduce a general\nframework that nicely connects the abstract concepts of structural recursion in\nthe programming language domain to concrete sequence modeling problems and\nlearned models' behavior. The framework includes a representation that captures\nthe general \\textit{syntax} of structural recursion, coupled with two different\nframeworks for understanding their \\textit{semantics} -- one that is more\nnatural from a programming languages perspective and one that helps bridge that\nperspective with a mechanistic understanding of the underlying transformer\narchitecture.\n With our framework as a powerful conceptual tool, we identify different\nissues under various set-ups. The models trained to emulate recursive\ncomputations cannot fully capture the recursion yet instead fit short-cut\nalgorithms and thus cannot solve certain edge cases that are under-represented\nin the training distribution. In addition, it is difficult for state-of-the-art\nlarge language models (LLMs) to mine recursive rules from in-context\ndemonstrations. 
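The conductor-and-experts pattern described in the meta-prompting abstract above can be sketched as three rounds of calls to a single text-in/text-out model: decompose the task, solve each subtask under a tailored instruction, then integrate and verify. The llm() callable and all prompt wording are placeholders, not the paper's prompts.

from typing import Callable, List

def meta_prompt(task: str, llm: Callable[[str], str]) -> str:
    # Conductor: break the task into subtasks, one per line.
    plan = llm(f"Break this task into short, independent subtasks, one per line:\n{task}")
    subtasks: List[str] = [s.strip() for s in plan.splitlines() if s.strip()]

    # Experts: each subtask is handled by a fresh instance with its own instruction.
    expert_outputs = [
        llm(f"You are an expert assigned only to this subtask.\nSubtask: {sub}\nAnswer:")
        for sub in subtasks
    ]

    # Conductor again: integrate and double-check the expert answers.
    joined = "\n".join(f"- {sub}: {out}" for sub, out in zip(subtasks, expert_outputs))
    return llm(f"Task: {task}\nExpert results:\n{joined}\n"
               f"Verify the results and produce the final answer.")

if __name__ == "__main__":
    def toy_llm(prompt: str) -> str:      # trivial stand-in for a real model
        if prompt.startswith("Break this task"):
            return "compute 2+2\ncompute 3*3"
        if "Subtask: compute 2+2" in prompt:
            return "4"
        if "Subtask: compute 3*3" in prompt:
            return "9"
        return "2+2=4 and 3*3=9"

    print(meta_prompt("Evaluate 2+2 and 3*3.", toy_llm))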
Meanwhile, these LLMs fail in interesting ways when emulating\nreduction (step-wise computation) of the recursive function.\n","authors":["Dylan Zhang","Curt Tigges","Zory Zhang","Stella Biderman","Maxim Raginsky","Talia Ringer"],"pdf_url":"https://arxiv.org/pdf/2401.12947v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.14699"},{"id":"http://arxiv.org/abs/2401.12941v1","updated":"2024-01-23T17:58:38Z","published":"2024-01-23T17:58:38Z","title":"Multicultural Name Recognition For Previously Unseen Names","summary":" State of the art Named Entity Recognition (NER) models have achieved an\nimpressive ability to extract common phrases from text that belong to labels\nsuch as location, organization, time, and person. However, typical NER systems\nthat rely on having seen a specific entity in their training data in order to\nlabel an entity perform poorly on rare or unseen entities (Derczynski et al., 2017).\nThis paper attempts to improve recognition of person names, a diverse category\nthat can grow any time someone is born or changes their name. In order for\ndownstream tasks to not exhibit bias based on cultural background, a model\nshould perform well on names from a variety of backgrounds. In this paper I\nexperiment with the training data and input structure of an English Bi-LSTM\nname recognition model. I look at names from 103 countries to compare how well\nthe model performs on names from different cultures, specifically in the\ncontext of a downstream task where extracted names will be matched to\ninformation on file. I find that a model with combined character and word input\noutperforms word-only models and may improve on accuracy compared to classical\nNER models that are not geared toward identifying unseen entity values.\n","authors":["Alexandra Loessberg-Zahl"],"pdf_url":"https://arxiv.org/pdf/2401.12941v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2401.12915v1","updated":"2024-01-23T17:07:18Z","published":"2024-01-23T17:07:18Z","title":"Red Teaming Visual Language Models","summary":" VLMs (Vision-Language Models) extend the capabilities of LLMs (Large Language\nModels) to accept multimodal inputs. Since it has been verified that LLMs can\nbe induced to generate harmful or inaccurate content through specific test\ncases (termed as Red Teaming), how VLMs perform in similar scenarios,\nespecially with their combination of textual and visual inputs, remains a\nquestion. To explore this problem, we present a novel red teaming dataset\nRTVLM, which encompasses 10 subtasks (e.g., image misleading, multi-modal\njail-breaking, face fairness, etc) under 4 primary aspects (faithfulness,\nprivacy, safety, fairness). Our RTVLM is the first red-teaming dataset to\nbenchmark current VLMs in terms of these 4 different aspects. Detailed analysis\nshows that 10 prominent open-sourced VLMs struggle with the red teaming in\ndifferent degrees and have up to 31% performance gap with GPT-4V. Additionally,\nwe simply apply red teaming alignment to LLaVA-v1.5 with Supervised Fine-tuning\n(SFT) using RTVLM, and this bolsters the models' performance with 10% in RTVLM\ntest set, 13% in MM-Hal, and without noticeable decline in MM-Bench,\noverpassing other LLaVA-based models with regular alignment data. This reveals\nthat current open-sourced VLMs still lack red teaming alignment. 
Our code and\ndatasets will be open-source.\n","authors":["Mukai Li","Lei Li","Yuwei Yin","Masood Ahmed","Zhenguang Liu","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12915v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2306.02272v3","updated":"2024-01-23T16:28:49Z","published":"2023-06-04T06:33:13Z","title":"OWQ: Lessons learned from activation outliers for weight quantization in\n large language models","summary":" Large language models (LLMs) with hundreds of billions of parameters require\npowerful server-grade GPUs for inference, limiting their practical deployment.\nTo address this challenge, we introduce the outlier-aware weight quantization\n(OWQ) method, which aims to minimize LLM's footprint through low-precision\nrepresentation. OWQ prioritizes a small subset of structured weights sensitive\nto quantization, storing them in high-precision, while applying highly tuned\nquantization to the remaining dense weights. This sensitivity-aware\nmixed-precision scheme reduces the quantization error notably, and extensive\nexperiments demonstrate that 3.1-bit models using OWQ perform comparably to\n4-bit models optimized by OPTQ. Furthermore, OWQ incorporates a\nparameter-efficient fine-tuning for task-specific adaptation, called weak\ncolumn tuning (WCT), enabling accurate task-specific LLM adaptation with\nminimal memory overhead in the optimized format. OWQ represents a notable\nadvancement in the flexibility, efficiency, and practicality of LLM\noptimization literature. The source code is available at\nhttps://github.com/xvyaward/owq\n","authors":["Changhun Lee","Jungyu Jin","Taesu Kim","Hyungjun Kim","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2306.02272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12874v1","updated":"2024-01-23T16:09:53Z","published":"2024-01-23T16:09:53Z","title":"From Understanding to Utilization: A Survey on Explainability for Large\n Language Models","summary":" This survey paper delves into the burgeoning field of explainability for\nLarge Language Models (LLMs), a critical yet challenging aspect of natural\nlanguage processing. With LLMs playing a pivotal role in various applications,\ntheir \"black-box\" nature raises concerns about transparency and ethical use.\nThis paper emphasizes the necessity for enhanced explainability in LLMs,\naddressing both the general public's trust and the technical community's need\nfor a deeper understanding of these models. We concentrate on pre-trained\nTransformer-based LLMs, such as LLaMA, which present unique interpretability\nchallenges due to their scale and complexity. Our review categorizes existing\nexplainability methods and discusses their application in improving model\ntransparency and reliability. We also discuss representative evaluation\nmethods, highlighting their strengths and limitations. 
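A hedged numpy sketch of the outlier-aware mixed-precision idea described in the OWQ abstract above: estimate a per-column sensitivity, keep the most sensitive weight columns in full precision, and round-to-nearest quantize the rest to a low bit-width. The sensitivity proxy and the group-less quantizer are simplifications, not the OWQ implementation available at the linked repository.

import numpy as np

def quantize_rtn(W, bits=3):
    # Symmetric per-column round-to-nearest quantization.
    qmax = 2 ** (bits - 1) - 1
    scale = np.maximum(np.abs(W).max(axis=0, keepdims=True), 1e-8) / qmax
    return np.clip(np.round(W / scale), -qmax - 1, qmax) * scale

def owq_like(W, act_var, bits=3, n_outlier_cols=4):
    """W: (out, in) weight matrix; act_var: per-input-channel activation variance."""
    sensitivity = act_var * (W ** 2).sum(axis=0)           # crude column sensitivity
    keep = np.argsort(-sensitivity)[:n_outlier_cols]       # columns kept in full precision
    Wq = quantize_rtn(W, bits)
    Wq[:, keep] = W[:, keep]                               # restore the sensitive columns
    return Wq, keep

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    W = rng.normal(size=(64, 128)).astype(np.float32)
    act_var = rng.uniform(0.1, 5.0, size=128)
    Wq, kept = owq_like(W, act_var, bits=3, n_outlier_cols=8)
    err = np.linalg.norm(W - Wq) / np.linalg.norm(W)
    print("kept columns:", kept[:5], "relative error:", round(float(err), 4))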
The goal of this survey\nis to bridge the gap between theoretical understanding and practical\napplication, offering insights for future research and development in the field\nof LLM explainability.\n","authors":["Haoyan Luo","Lucia Specia"],"pdf_url":"https://arxiv.org/pdf/2401.12874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12873v1","updated":"2024-01-23T16:07:43Z","published":"2024-01-23T16:07:43Z","title":"Improving Machine Translation with Human Feedback: An Exploration of\n Quality Estimation as a Reward Model","summary":" Insufficient modeling of human preferences within the reward model is a major\nobstacle for leveraging human feedback to improve translation quality.\nFortunately, quality estimation (QE), which predicts the quality of a given\ntranslation without reference, has achieved impressive alignment with human\nevaluations in the last two years. In this work, we investigate the potential\nof employing the QE model as the reward model (the QE-based reward model) to\npredict human preferences for feedback training. We first identify the\noveroptimization problem during QE-based feedback training, manifested as an\nincrease in reward while translation quality declines. We examine the problem\nand argue that the vulnerability of the QE model might lead to high rewards for\nincorrect translations, resulting in overoptimization and error propagation. To\naddress the problem, we adopt a simple yet effective method that uses heuristic\nrules to detect the incorrect translations and assigns a penalty term to the\nQE-based rewards for the detected incorrect translations. Experimental results\nshow that the proposed QE-based feedback training achieves consistent and\nsignificant improvements across various settings, further verified through\nhuman preference studies. Our subsequent analysis demonstrates the high data\nefficiency of the proposed QE-based feedback training: the proposed approach\nusing a small amount of monolingual data can outperform systems using larger\nparallel corpora.\n","authors":["Zhiwei He","Xing Wang","Wenxiang Jiao","Zhuosheng Zhang","Rui Wang","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2401.12873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12863v1","updated":"2024-01-23T15:56:11Z","published":"2024-01-23T15:56:11Z","title":"KAM-CoT: Knowledge Augmented Multimodal Chain-of-Thoughts Reasoning","summary":" Large Language Models (LLMs) have demonstrated impressive performance in\nnatural language processing tasks by leveraging chain of thought (CoT) that\nenables step-by-step thinking. Extending LLMs with multimodal capabilities is\nthe recent interest, but incurs computational cost and requires substantial\nhardware resources. To address these challenges, we propose KAM-CoT a framework\nthat integrates CoT reasoning, Knowledge Graphs (KGs), and multiple modalities\nfor a comprehensive understanding of multimodal tasks. KAM-CoT adopts a\ntwo-stage training process with KG grounding to generate effective rationales\nand answers. By incorporating external knowledge from KGs during reasoning, the\nmodel gains a deeper contextual understanding reducing hallucinations and\nenhancing the quality of answers. This knowledge-augmented CoT reasoning\nempowers the model to handle questions requiring external context, providing\nmore informed answers. Experimental findings show KAM-CoT outperforms the\nstate-of-the-art methods. 
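The penalized QE-based reward described in the machine-translation abstract above can be sketched as follows, assuming some heuristic rules for flagging incorrect translations (empty output, source copying, degenerate repetition) and a fixed penalty; both the rules and the penalty value are illustrative assumptions rather than the paper's exact heuristics.

def looks_incorrect(source: str, translation: str) -> bool:
    words = translation.split()
    if not words:
        return True                                   # empty output
    if translation.strip() == source.strip():
        return True                                   # untranslated copy of the source
    if len(set(words)) < max(1, len(words) // 3):
        return True                                   # degenerate repetition
    return False

def qe_reward(source: str, translation: str, qe_score: float,
              penalty: float = 1.0) -> float:
    # Reward for feedback training: the QE score minus a penalty for flagged outputs.
    return qe_score - (penalty if looks_incorrect(source, translation) else 0.0)

if __name__ == "__main__":
    src = "Das Wetter ist heute schoen."
    good = "The weather is nice today."
    bad = "nice nice nice nice nice nice"
    print(qe_reward(src, good, qe_score=0.82))        # 0.82
    print(qe_reward(src, bad, qe_score=0.90))         # penalised despite a high QE score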
On the ScienceQA dataset, we achieve an average\naccuracy of 93.87%, surpassing GPT-3.5 (75.17%) by 18% and GPT-4 (83.99%) by\n10%. Remarkably, KAM-CoT achieves these results with only 280M trainable\nparameters at a time, demonstrating its cost-efficiency and effectiveness.\n","authors":["Debjyoti Mondal","Suraj Modi","Subhadarshi Panda","Rituraj Singh","Godawari Sudhakar Rao"],"pdf_url":"https://arxiv.org/pdf/2401.12863v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2304.14391v4","updated":"2024-01-23T15:52:28Z","published":"2023-04-27T17:55:13Z","title":"Energy-based Models are Zero-Shot Planners for Compositional Scene\n Rearrangement","summary":" Language is compositional; an instruction can express multiple relation\nconstraints to hold among objects in a scene that a robot is tasked to\nrearrange. Our focus in this work is an instructable scene-rearranging\nframework that generalizes to longer instructions and to spatial concept\ncompositions never seen at training time. We propose to represent\nlanguage-instructed spatial concepts with energy functions over relative object\narrangements. A language parser maps instructions to corresponding energy\nfunctions and an open-vocabulary visual-language model grounds their arguments\nto relevant objects in the scene. We generate goal scene configurations by\ngradient descent on the sum of energy functions, one per language predicate in\nthe instruction. Local vision-based policies then re-locate objects to the\ninferred goal locations. We test our model on established instruction-guided\nmanipulation benchmarks, as well as benchmarks of compositional instructions we\nintroduce. We show our model can execute highly compositional instructions\nzero-shot in simulation and in the real world. It outperforms\nlanguage-to-action reactive policies and Large Language Model planners by a\nlarge margin, especially for long instructions that involve compositions of\nmultiple spatial concepts. Simulation and real-world robot execution videos, as\nwell as our code and datasets are publicly available on our website:\nhttps://ebmplanner.github.io.\n","authors":["Nikolaos Gkanatsios","Ayush Jain","Zhou Xian","Yunchu Zhang","Christopher Atkeson","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2304.14391v4.pdf","comment":"First two authors contributed equally | RSS 2023"},{"id":"http://arxiv.org/abs/2310.00367v2","updated":"2024-01-23T15:20:33Z","published":"2023-09-30T13:15:49Z","title":"AutomaTikZ: Text-Guided Synthesis of Scientific Vector Graphics with\n TikZ","summary":" Generating bitmap graphics from text has gained considerable attention, yet\nfor scientific figures, vector graphics are often preferred. Given that vector\ngraphics are typically encoded using low-level graphics primitives, generating\nthem directly is difficult. To address this, we propose the use of TikZ, a\nwell-known abstract graphics language that can be compiled to vector graphics,\nas an intermediate representation of scientific figures. TikZ offers\nhuman-oriented, high-level commands, thereby facilitating conditional language\nmodeling with any large language model. To this end, we introduce DaTikZ, the\nfirst large-scale TikZ dataset consisting of 120k TikZ drawings aligned with\ncaptions. We fine-tune LLaMA on DaTikZ, as well as our new model CLiMA, which\naugments LLaMA with multimodal CLIP embeddings. 
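To illustrate the planning-by-energy-minimization recipe in the scene-rearrangement abstract above, the sketch below encodes two toy spatial predicates as energies over 2-D object positions and finds a goal configuration by gradient descent on their sum; the predicates, finite-difference gradients, and step sizes are illustrative stand-ins for the paper's learned energy models.

import numpy as np

def energy_left_of(pos, a, b, margin=0.2):
    # Low when object a sits at least `margin` to the left of object b.
    return max(0.0, pos[a][0] - pos[b][0] + margin) ** 2

def energy_near(pos, a, b, dist=0.3):
    return (np.linalg.norm(pos[a] - pos[b]) - dist) ** 2

def total_energy(pos, constraints):
    return sum(fn(pos, *args) for fn, *args in constraints)

def descend(pos, constraints, steps=300, lr=0.1, eps=1e-4):
    pos = {k: v.astype(float).copy() for k, v in pos.items()}
    for _ in range(steps):
        for name in pos:
            grad = np.zeros(2)
            for i in range(2):                        # finite-difference gradient
                bump = pos[name].copy()
                bump[i] += eps
                bumped = dict(pos, **{name: bump})
                grad[i] = (total_energy(bumped, constraints) -
                           total_energy(pos, constraints)) / eps
            pos[name] -= lr * grad
    return pos

if __name__ == "__main__":
    start = {"mug": np.array([0.8, 0.0]), "plate": np.array([0.0, 0.0])}
    # "Put the mug to the left of the plate and near it."
    constraints = [(energy_left_of, "mug", "plate"), (energy_near, "mug", "plate")]
    goal = descend(start, constraints)
    print({k: np.round(v, 2) for k, v in goal.items()})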
In both human and automatic\nevaluation, CLiMA and LLaMA outperform commercial GPT-4 and Claude 2 in terms\nof similarity to human-created figures, with CLiMA additionally improving\ntext-image alignment. Our detailed analysis shows that all models generalize\nwell and are not susceptible to memorization. GPT-4 and Claude 2, however, tend\nto generate more simplistic figures compared to both humans and our models. We\nmake our framework, AutomaTikZ, along with model weights and datasets, publicly\navailable.\n","authors":["Jonas Belouadi","Anne Lauscher","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2310.00367v2.pdf","comment":"Accepted at ICLR 2024 (poster); Project Page:\n https://github.com/potamides/AutomaTikZ"},{"id":"http://arxiv.org/abs/2401.10543v2","updated":"2024-01-23T14:46:23Z","published":"2024-01-19T08:02:37Z","title":"Multilingual acoustic word embeddings for zero-resource languages","summary":" This research addresses the challenge of developing speech applications for\nzero-resource languages that lack labelled data. It specifically uses acoustic\nword embedding (AWE) -- fixed-dimensional representations of variable-duration\nspeech segments -- employing multilingual transfer, where labelled data from\nseveral well-resourced languages are used for pertaining. The study introduces\na new neural network that outperforms existing AWE models on zero-resource\nlanguages. It explores the impact of the choice of well-resourced languages.\nAWEs are applied to a keyword-spotting system for hate speech detection in\nSwahili radio broadcasts, demonstrating robustness in real-world scenarios.\nAdditionally, novel semantic AWE models improve semantic query-by-example\nsearch.\n","authors":["Christiaan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2401.10543v2.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2401.12798v1","updated":"2024-01-23T14:31:12Z","published":"2024-01-23T14:31:12Z","title":"Gradient Flow of Energy: A General and Efficient Approach for Entity\n Alignment Decoding","summary":" Entity alignment (EA), a pivotal process in integrating multi-source\nKnowledge Graphs (KGs), seeks to identify equivalent entity pairs across these\ngraphs. Most existing approaches regard EA as a graph representation learning\ntask, concentrating on enhancing graph encoders. However, the decoding process\nin EA - essential for effective operation and alignment accuracy - has received\nlimited attention and remains tailored to specific datasets and model\narchitectures, necessitating both entity and additional explicit relation\nembeddings. This specificity limits its applicability, particularly in\nGNN-based models. To address this gap, we introduce a novel, generalized, and\nefficient decoding approach for EA, relying solely on entity embeddings. Our\nmethod optimizes the decoding process by minimizing Dirichlet energy, leading\nto the gradient flow within the graph, to promote graph homophily. The\ndiscretization of the gradient flow produces a fast and scalable approach,\ntermed Triple Feature Propagation (TFP). TFP innovatively channels gradient\nflow through three views: entity-to-entity, entity-to-relation, and\nrelation-to-entity. This generalized gradient flow enables TFP to harness the\nmulti-view structural information of KGs. Rigorous experimentation on diverse\nreal-world datasets demonstrates that our approach significantly enhances\nvarious EA methods. 
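A single-view illustration of decoding by Dirichlet-energy gradient flow as described in the entity-alignment abstract above: discretizing the flow gives an iterative feature-propagation update over a normalized adjacency matrix, after which entities are matched by nearest neighbour. TFP's three views (entity-to-entity, entity-to-relation, relation-to-entity) are collapsed into one here for brevity, so this is a sketch of the idea rather than the method itself.

import numpy as np

def normalized_adjacency(edges, n):
    A = np.zeros((n, n))
    for i, j in edges:
        A[i, j] = A[j, i] = 1.0
    deg = np.maximum(A.sum(1), 1.0)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))
    return D_inv_sqrt @ A @ D_inv_sqrt

def propagate(X, A_hat, steps=10, alpha=0.5):
    # Each step is a discretized gradient-flow update that lowers Dirichlet energy.
    for _ in range(steps):
        X = (1 - alpha) * X + alpha * (A_hat @ X)
    return X

def align(X1, X2):
    # Nearest-neighbour matching on cosine similarity after propagation.
    X1 = X1 / np.linalg.norm(X1, axis=1, keepdims=True)
    X2 = X2 / np.linalg.norm(X2, axis=1, keepdims=True)
    return np.argmax(X1 @ X2.T, axis=1)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    emb1, emb2 = rng.normal(size=(5, 8)), rng.normal(size=(5, 8))
    edges = [(0, 1), (1, 2), (2, 3), (3, 4)]
    A_hat = normalized_adjacency(edges, 5)
    print("matches:", align(propagate(emb1, A_hat), propagate(emb2, A_hat)))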
Notably, the approach achieves these advancements with less\nthan 6 seconds of additional computational time, establishing a new benchmark\nin efficiency and adaptability for future EA methods.\n","authors":["Yuanyi Wang","Haifeng Sun","Jingyu Wang","Qi Qi","Shaoling Sun","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2401.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12794v1","updated":"2024-01-23T14:29:17Z","published":"2024-01-23T14:29:17Z","title":"Benchmarking LLMs via Uncertainty Quantification","summary":" The proliferation of open-source Large Language Models (LLMs) from various\ninstitutions has highlighted the urgent need for comprehensive evaluation\nmethods. However, current evaluation platforms, such as the widely recognized\nHuggingFace open LLM leaderboard, neglect a crucial aspect -- uncertainty,\nwhich is vital for thoroughly assessing LLMs. To bridge this gap, we introduce\na new benchmarking approach for LLMs that integrates uncertainty\nquantification. Our examination involves eight LLMs (LLM series) spanning five\nrepresentative natural language processing tasks. Additionally, we introduce an\nuncertainty-aware evaluation metric, UAcc, which takes into account both\nprediction accuracy and prediction uncertainty. Our findings reveal that: I)\nLLMs with higher accuracy may exhibit lower certainty; II) Larger-scale LLMs\nmay display greater uncertainty compared to their smaller counterparts; and\nIII) Instruction-finetuning tends to increase the uncertainty of LLMs. By\ntaking uncertainty into account, our new UAcc metric can either amplify or\ndiminish the relative improvement of one LLM over another and may even change\nthe relative ranking of two LLMs. These results underscore the significance of\nincorporating uncertainty in the evaluation of LLMs.\n","authors":["Fanghua Ye","Mingming Yang","Jianhui Pang","Longyue Wang","Derek F. Wong","Emine Yilmaz","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2401.12794v1.pdf","comment":"24 pages, preprints"},{"id":"http://arxiv.org/abs/2401.12789v1","updated":"2024-01-23T14:19:01Z","published":"2024-01-23T14:19:01Z","title":"Multilingual and Fully Non-Autoregressive ASR with Large Language Model\n Fusion: A Comprehensive Study","summary":" In the era of large models, the autoregressive nature of decoding often\nresults in latency serving as a significant bottleneck. We propose a\nnon-autoregressive LM-fused ASR system that effectively leverages the\nparallelization capabilities of accelerator hardware. Our approach combines the\nUniversal Speech Model (USM) and the PaLM 2 language model in per-segment\nscoring mode, achieving an average relative WER improvement across all\nlanguages of 10.8% on FLEURS and 3.6% on YouTube captioning. Furthermore, our\ncomprehensive ablation study analyzes key parameters such as LLM size, context\nlength, vocabulary size, fusion methodology. For instance, we explore the\nimpact of LLM size ranging from 128M to 340B parameters on ASR performance.\nThis study provides valuable insights into the factors influencing the\neffectiveness of practical large-scale LM-fused speech recognition systems.\n","authors":["W. Ronny Huang","Cyril Allauzen","Tongzhou Chen","Kilol Gupta","Ke Hu","James Qin","Yu Zhang","Yongqiang Wang","Shuo-Yiin Chang","Tara N. 
Sainath"],"pdf_url":"https://arxiv.org/pdf/2401.12789v1.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.12890v3","updated":"2024-01-23T13:42:03Z","published":"2023-08-24T16:09:13Z","title":"Large Language Models Vote: Prompting for Rare Disease Identification","summary":" The emergence of generative Large Language Models (LLMs) emphasizes the need\nfor accurate and efficient prompting approaches. LLMs are often applied in\nFew-Shot Learning (FSL) contexts, where tasks are executed with minimal\ntraining data. FSL has become popular in many Artificial Intelligence (AI)\nsubdomains, including AI for health. Rare diseases affect a small fraction of\nthe population. Rare disease identification from clinical notes inherently\nrequires FSL techniques due to limited data availability. Manual data\ncollection and annotation is both expensive and time-consuming. In this paper,\nwe propose Models-Vote Prompting (MVP), a flexible prompting approach for\nimproving the performance of LLM queries in FSL settings. MVP works by\nprompting numerous LLMs to perform the same tasks and then conducting a\nmajority vote on the resulting outputs. This method achieves improved results\nto any one model in the ensemble on one-shot rare disease identification and\nclassification tasks. We also release a novel rare disease dataset for FSL,\navailable to those who signed the MIMIC-IV Data Use Agreement (DUA).\nFurthermore, in using MVP, each model is prompted multiple times, substantially\nincreasing the time needed for manual annotation, and to address this, we\nassess the feasibility of using JSON for automating generative LLM evaluation.\n","authors":["David Oniani","Jordan Hilsman","Hang Dong","Fengyi Gao","Shiven Verma","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12756v1","updated":"2024-01-23T13:35:47Z","published":"2024-01-23T13:35:47Z","title":"What the Weight?! A Unified Framework for Zero-Shot Knowledge\n Composition","summary":" The knowledge encapsulated in a model is the core factor determining its\nfinal performance on downstream tasks. Much research in NLP has focused on\nefficient methods for storing and adapting different types of knowledge, e.g.,\nin dedicated modularized structures, and on how to effectively combine these,\ne.g., by learning additional parameters. However, given the many possible\noptions, a thorough understanding of the mechanisms involved in these\ncompositions is missing, and hence it remains unclear which strategies to\nutilize. To address this research gap, we propose a novel framework for\nzero-shot module composition, which encompasses existing and some novel\nvariations for selecting, weighting, and combining parameter modules under a\nsingle unified notion. Focusing on the scenario of domain knowledge and adapter\nlayers, our framework provides a systematic unification of concepts, allowing\nus to conduct the first comprehensive benchmarking study of various zero-shot\nknowledge composition strategies. In particular, we test two module combination\nmethods and five selection and weighting strategies for their effectiveness and\nefficiency in an extensive experimental setup. Our results highlight the\nefficacy of ensembling but also hint at the power of simple though\noften-ignored weighting methods. Further in-depth analyses allow us to\nunderstand the role of weighting vs. 
top-k selection, and show that, to a\ncertain extent, the performance of adapter composition can even be predicted.\n","authors":["Carolin Holtermann","Markus Frohmann","Navid Rekabsaz","Anne Lauscher"],"pdf_url":"https://arxiv.org/pdf/2401.12756v1.pdf","comment":"Accepted to Findings of the ACL: EACL 2024"},{"id":"http://arxiv.org/abs/2401.08517v2","updated":"2024-01-23T13:29:20Z","published":"2024-01-16T17:31:35Z","title":"Supporting Student Decisions on Learning Recommendations: An LLM-Based\n Chatbot with Knowledge Graph Contextualization for Conversational\n Explainability and Mentoring","summary":" Student commitment towards a learning recommendation is not separable from\ntheir understanding of the reasons it was recommended to them; and their\nability to modify it based on that understanding. Among explainability\napproaches, chatbots offer the potential to engage the student in a\nconversation, similar to a discussion with a peer or a mentor. The capabilities\nof chatbots, however, are still not sufficient to replace a human mentor,\ndespite the advancements of generative AI (GenAI) and large language models\n(LLM). Therefore, we propose an approach to utilize chatbots as mediators of\nthe conversation and sources of limited and controlled generation of\nexplanations, to harvest the potential of LLMs while reducing their potential\nrisks at the same time. The proposed LLM-based chatbot supports students in\nunderstanding learning-paths recommendations. We use a knowledge graph (KG) as\na human-curated source of information, to regulate the LLM's output through\ndefining its prompt's context. A group chat approach is developed to connect\nstudents with human mentors, either on demand or in cases that exceed the\nchatbot's pre-defined tasks. We evaluate the chatbot with a user study, to\nprovide a proof-of-concept and highlight the potential requirements and\nlimitations of utilizing chatbots in conversational explainability.\n","authors":["Hasan Abu-Rasheed","Mohamad Hussam Abdulsalam","Christian Weber","Madjid Fathi"],"pdf_url":"https://arxiv.org/pdf/2401.08517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07913v4","updated":"2024-01-23T13:26:56Z","published":"2023-12-13T06:11:42Z","title":"A Survey of Text Watermarking in the Era of Large Language Models","summary":" Text watermarking algorithms play a crucial role in the copyright protection\nof textual content, yet their capabilities and application scenarios have been\nlimited historically. The recent developments in large language models (LLMs)\nhave opened new opportunities for the advancement of text watermarking\ntechniques. LLMs not only enhance the capabilities of text watermarking\nalgorithms through their text understanding and generation abilities but also\nnecessitate the use of text watermarking algorithms for their own copyright\nprotection. This paper conducts a comprehensive survey of the current state of\ntext watermarking technology, covering four main aspects: (1) an overview and\ncomparison of different text watermarking techniques; (2) evaluation methods\nfor text watermarking algorithms, including their success rates, impact on text\nquality, robustness, and unforgeability; (3) potential application scenarios\nfor text watermarking technology; (4) current challenges and future directions\nfor development. 
This survey aims to provide researchers with a thorough\nunderstanding of text watermarking technology, thereby promoting its further\nadvancement.\n","authors":["Aiwei Liu","Leyi Pan","Yijian Lu","Jingjing Li","Xuming Hu","Xi Zhang","Lijie Wen","Irwin King","Hui Xiong","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2312.07913v4.pdf","comment":"35 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.12720v1","updated":"2024-01-23T12:41:03Z","published":"2024-01-23T12:41:03Z","title":"A Comprehensive View of the Biases of Toxicity and Sentiment Analysis\n Methods Towards Utterances with African American English Expressions","summary":" Language is a dynamic aspect of our culture that changes when expressed in\ndifferent technologies/communities. Online social networks have enabled the\ndiffusion and evolution of different dialects, including African American\nEnglish (AAE). However, this increased usage is not without barriers. One\nparticular barrier is how sentiment (Vader, TextBlob, and Flair) and toxicity\n(Google's Perspective and the open-source Detoxify) methods present biases\ntowards utterances with AAE expressions. Consider Google's Perspective to\nunderstand bias. Here, an utterance such as ``All n*ggers deserve to die\nrespectfully. The police murder us.'' it reaches a higher toxicity than\n``African-Americans deserve to die respectfully. The police murder us.''. This\nscore difference likely arises because the tool cannot understand the\nre-appropriation of the term ``n*gger''. One explanation for this bias is that\nAI models are trained on limited datasets, and using such a term in training\ndata is more likely to appear in a toxic utterance. While this may be\nplausible, the tool will make mistakes regardless. Here, we study bias on two\nWeb-based (YouTube and Twitter) datasets and two spoken English datasets. Our\nanalysis shows how most models present biases towards AAE in most settings. We\nisolate the impact of AAE expression usage via linguistic control features from\nthe Linguistic Inquiry and Word Count (LIWC) software, grammatical control\nfeatures extracted via Part-of-Speech (PoS) tagging from Natural Language\nProcessing (NLP) models, and the semantic of utterances by comparing sentence\nembeddings from recent language models. We present consistent results on how a\nheavy usage of AAE expressions may cause the speaker to be considered\nsubstantially more toxic, even when speaking about nearly the same subject. Our\nstudy complements similar analyses focusing on small datasets and/or one method\nonly.\n","authors":["Guilherme H. Resende","Luiz F. Nery","Fabrício Benevenuto","Savvas Zannettou","Flavio Figueiredo"],"pdf_url":"https://arxiv.org/pdf/2401.12720v1.pdf","comment":"Under peer review"},{"id":"http://arxiv.org/abs/2401.12713v1","updated":"2024-01-23T12:29:37Z","published":"2024-01-23T12:29:37Z","title":"Generating Unsupervised Abstractive Explanations for Rumour Verification","summary":" The task of rumour verification in social media concerns assessing the\nveracity of a claim on the basis of conversation threads that result from it.\nWhile previous work has focused on predicting a veracity label, here we\nreformulate the task to generate model-centric, free-text explanations of a\nrumour's veracity. We follow an unsupervised approach by first utilising\npost-hoc explainability methods to score the most important posts within a\nthread and then we use these posts to generate informative explanatory\nsummaries by employing template-guided summarisation. 
To evaluate the\ninformativeness of the explanatory summaries, we exploit the few-shot learning\ncapabilities of a large language model (LLM). Our experiments show that LLMs\ncan have similar agreement to humans in evaluating summaries. Importantly, we\nshow that explanatory abstractive summaries are more informative and better\nreflect the predicted rumour veracity than just using the highest ranking posts\nin the thread.\n","authors":["Iman Munire Bilal","Preslav Nakov","Rob Procter","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2401.12713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12689v1","updated":"2024-01-23T11:54:09Z","published":"2024-01-23T11:54:09Z","title":"Energy-based Automated Model Evaluation","summary":" The conventional evaluation protocols on machine learning models rely heavily\non a labeled, i.i.d-assumed testing dataset, which is not often present in real\nworld applications. The Automated Model Evaluation (AutoEval) shows an\nalternative to this traditional workflow, by forming a proximal prediction\npipeline of the testing performance without the presence of ground-truth\nlabels. Despite its recent successes, the AutoEval frameworks still suffer from\nan overconfidence issue, substantial storage and computational cost. In that\nregard, we propose a novel measure -- Meta-Distribution Energy (MDE) -- that\nallows the AutoEval framework to be both more efficient and effective. The core\nof the MDE is to establish a meta-distribution statistic, on the information\n(energy) associated with individual samples, then offer a smoother\nrepresentation enabled by energy-based learning. We further provide our\ntheoretical insights by connecting the MDE with the classification loss. We\nprovide extensive experiments across modalities, datasets and different\narchitectural backbones to validate MDE's validity, together with its\nsuperiority compared with prior approaches. We also prove MDE's versatility by\nshowing its seamless integration with large-scale models, and easy adaption to\nlearning scenarios with noisy- or imbalanced- labels.\n","authors":["Ru Peng","Heming Zou","Haobo Wang","Yawen Zeng","Zenan Huang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.12689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12671v1","updated":"2024-01-23T11:25:34Z","published":"2024-01-23T11:25:34Z","title":"Context Matters: Pushing the Boundaries of Open-Ended Answer Generation\n with Graph-Structured Knowledge Context","summary":" In the continuously advancing AI landscape, crafting context-rich and\nmeaningful responses via Large Language Models (LLMs) is essential. Researchers\nare becoming more aware of the challenges that LLMs with fewer parameters\nencounter when trying to provide suitable answers to open-ended questions. To\naddress these hurdles, the integration of cutting-edge strategies, augmentation\nof rich external domain knowledge to LLMs, offers significant improvements.\nThis paper introduces a novel framework that combines graph-driven context\nretrieval in conjunction to knowledge graphs based enhancement, honing the\nproficiency of LLMs, especially in domain specific community question answering\nplatforms like AskUbuntu, Unix, and ServerFault. 
We conduct experiments on\nvarious LLMs with different parameter sizes to evaluate their ability to ground\nknowledge and determine factual accuracy in answers to open-ended questions.\nOur methodology GraphContextGen consistently outperforms dominant text-based\nretrieval systems, demonstrating its robustness and adaptability to a larger\nnumber of use cases. This advancement highlights the importance of pairing\ncontext rich data retrieval with LLMs, offering a renewed approach to knowledge\nsourcing and generation in AI systems. We also show that, due to rich\ncontextual data retrieval, the crucial entities, along with the generated\nanswer, remain factually coherent with the gold answer.\n","authors":["Somnath Banerjee","Amruit Sahoo","Sayan Layek","Avik Dutta","Rima Hazra","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2401.12671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12631v1","updated":"2024-01-23T10:27:42Z","published":"2024-01-23T10:27:42Z","title":"A Reply to Makelov et al. (2023)'s \"Interpretability Illusion\" Arguments","summary":" We respond to the recent paper by Makelov et al. (2023), which reviews\nsubspace interchange intervention methods like distributed alignment search\n(DAS; Geiger et al. 2023) and claims that these methods potentially cause\n\"interpretability illusions\". We first review Makelov et al. (2023)'s technical\nnotion of what an \"interpretability illusion\" is, and then we show that even\nintuitive and desirable explanations can qualify as illusions in this sense. As\na result, their method of discovering \"illusions\" can reject explanations they\nconsider \"non-illusory\". We then argue that the illusions Makelov et al. (2023)\nsee in practice are artifacts of their training and evaluation paradigms. We\nclose by emphasizing that, though we disagree with their core characterization,\nMakelov et al. (2023)'s examples and discussion have undoubtedly pushed the\nfield of interpretability forward.\n","authors":["Zhengxuan Wu","Atticus Geiger","Jing Huang","Aryaman Arora","Thomas Icard","Christopher Potts","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2401.12631v1.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.11969v2","updated":"2024-01-23T09:35:02Z","published":"2024-01-22T14:17:03Z","title":"Claim Detection for Automated Fact-checking: A Survey on Monolingual,\n Multilingual and Cross-Lingual Research","summary":" Automated fact-checking has drawn considerable attention over the past few\ndecades due to the increase in the diffusion of misinformation on online\nplatforms. This is often carried out as a sequence of tasks comprising (i) the\ndetection of sentences circulating in online platforms which constitute claims\nneeding verification, followed by (ii) the verification process of those\nclaims. This survey focuses on the former, by discussing existing efforts\ntowards detecting claims needing fact-checking, with a particular focus on\nmultilingual data and methods. This is a challenging and fertile direction\nwhere existing methods are yet far from matching human performance due to the\nprofoundly challenging nature of the issue. Especially, the dissemination of\ninformation across multiple social platforms, articulated in multiple languages\nand modalities demands more generalized solutions for combating misinformation.\nFocusing on multilingual misinformation, we present a comprehensive survey of\nexisting multilingual claim detection research. 
We present state-of-the-art\nmultilingual claim detection research categorized into three key factors of the\nproblem, verifiability, priority, and similarity. Further, we present a\ndetailed overview of the existing multilingual datasets along with the\nchallenges and suggest possible future advancements.\n","authors":["Rrubaa Panchendrarajan","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.11969v2.pdf","comment":"Typo corrected"},{"id":"http://arxiv.org/abs/2401.12585v1","updated":"2024-01-23T09:33:31Z","published":"2024-01-23T09:33:31Z","title":"SLANG: New Concept Comprehension of Large Language Models","summary":" The dynamic nature of language, particularly evident in the realm of slang\nand memes on the Internet, poses serious challenges to the adaptability of\nlarge language models (LLMs). Traditionally anchored to static datasets, these\nmodels often struggle to keep up with the rapid linguistic evolution\ncharacteristic of online communities. This research addresses the critical need\nto bridge this gap, aiming to enhance LLMs' comprehension of evolving new\nconcepts on the internet, without the high cost and impracticality of continual\nretraining. To address this issue, we propose a new benchmark $\\textbf{SLANG}$\nto assess LLMs' proficiency in comprehending emerging linguistic trends and a\nbaseline approach $\\textbf{FOCUS}$, which uses causal inference to enhance LLMs\nto understand new phrases and usage patterns. This approach involves\nscrutinizing real-world instances of linguistic shifts, serving as contextual\nbeacons, to form more precise and contextually relevant connections between\nnewly emerging expressions and their intended meanings. The empirical analysis\nshows that our causal inference-based approach outperforms the traditional\nmodels in terms of precision and relevance in the interpretation of Internet\nslang and memes.\n","authors":["Lingrui Mei","Shenghua Liu","Yiwei Wang","Baolong Bi","Xueqi Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01185v3","updated":"2024-01-23T09:16:00Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address (SOTU) dataset from Kaggle\nto make some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2.\n While it is widely believed that BERT (and its variations) is most suitable\nfor NLP classification tasks, we find out that GPT-2 in conjunction with\nnonlinear dimension reduction methods such as UMAP provide better separation\nand stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In\nour case, no model fine-tuning is required, and the pre-trained out-of-the-box\nGPT-2 model is enough.\n We also used a fine-tuned DistilBERT model for classification detecting which\nPresident delivered which address, with very good results (accuracy 93% - 95%\ndepending on the run). 
An analogous task was performed to determine the year of\nwriting, and we were able to pin it down to about 4 years (which is a single\npresidential term).\n It is worth noting that SOTU addresses provide relatively small writing\nsamples (with about 8'000 words on average, and varying widely from under 2'000\nwords to more than 20'000), and that the number of authors is relatively large\n(we used SOTU addresses of 42 US presidents). This shows that the techniques\nemployed turn out to be rather efficient, while all the computations described\nin this note can be performed using a single GPU instance of Google Colab.\n The accompanying code is available on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v3.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2401.12576v1","updated":"2024-01-23T09:11:07Z","published":"2024-01-23T09:11:07Z","title":"LLMCheckup: Conversational Examination of Large Language Models via\n Interpretability Tools","summary":" Interpretability tools that offer explanations in the form of a dialogue have\ndemonstrated their efficacy in enhancing users' understanding, as one-off\nexplanations may occasionally fall short in providing sufficient information to\nthe user. Current solutions for dialogue-based explanations, however, require\nmany dependencies and are not easily transferable to tasks they were not\ndesigned for. With LLMCheckup, we present an easily accessible tool that allows\nusers to chat with any state-of-the-art large language model (LLM) about its\nbehavior. We enable LLMs to generate all explanations by themselves and take\ncare of intent recognition without fine-tuning, by connecting them with a broad\nspectrum of Explainable AI (XAI) tools, e.g. feature attributions,\nembedding-based similarity, and prompting strategies for counterfactual and\nrationale generation. LLM (self-)explanations are presented as an interactive\ndialogue that supports follow-up questions and generates suggestions.\nLLMCheckup provides tutorials for operations available in the system, catering\nto individuals with varying levels of expertise in XAI and supports multiple\ninput modalities. We introduce a new parsing strategy called multi-prompt\nparsing substantially enhancing the parsing accuracy of LLMs. Finally, we\nshowcase the tasks of fact checking and commonsense question answering.\n","authors":["Qianli Wang","Tatiana Anikina","Nils Feldhus","Josef van Genabith","Leonhard Hennig","Sebastian Möller"],"pdf_url":"https://arxiv.org/pdf/2401.12576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12192v2","updated":"2024-01-23T09:05:41Z","published":"2022-12-23T08:23:32Z","title":"Learning to Generate Questions by Enhancing Text Generation with\n Sentence Selection","summary":" We introduce an approach for the answer-aware question generation problem.\nInstead of only relying on the capability of strong pre-trained language\nmodels, we observe that the information of answers and questions can be found\nin some relevant sentences in the context. Based on that, we design a model\nwhich includes two modules: a selector and a generator. The selector forces the\nmodel to more focus on relevant sentences regarding an answer to provide\nimplicit local information. The generator generates questions by implicitly\ncombining local information from the selector and global information from the\nwhole context encoded by the encoder. 
The model is trained jointly to take\nadvantage of latent interactions between the two modules. Experimental results\non two benchmark datasets show that our model is better than strong pre-trained\nmodels for the question generation task. The code is also available.\n","authors":["Pham Quoc-Hung","Minh-Tien Nguyen","Manh Tran-Tien","Hung Le","Xuan-Hieu Phan"],"pdf_url":"https://arxiv.org/pdf/2212.12192v2.pdf","comment":"This paper describes an on-going work"},{"id":"http://arxiv.org/abs/2401.06827v2","updated":"2024-01-23T08:54:15Z","published":"2024-01-12T04:54:01Z","title":"APLe: Token-Wise Adaptive for Multi-Modal Prompt Learning","summary":" Pre-trained Vision-Language (V-L) models set the benchmark for generalization\nto downstream tasks among the noteworthy contenders. Many characteristics of\nthe V-L model have been explored in existing research including the challenge\nof the sensitivity to text input and the tuning process across multi-modal\nprompts. With the advanced utilization of the V-L model like CLIP, recent\napproaches deploy learnable prompts instead of hand-craft prompts to boost the\ngeneralization performance and address the aforementioned challenges. Inspired\nby layer-wise training, which is wildly used in image fusion, we note that\nusing a sequential training process to adapt different modalities branches of\nCLIP efficiently facilitates the improvement of generalization. In the context\nof addressing the multi-modal prompting challenge, we propose Token-wise\nAdaptive for Multi-modal Prompt Learning (APLe) for tuning both modalities\nprompts, vision and language, as tokens in a sequential manner. APLe addresses\nthe challenges in V-L models to promote prompt learning across both modalities,\nwhich indicates a competitive generalization performance in line with the\nstate-of-the-art. Preeminently, APLe shows robustness and favourable\nperformance in prompt-length experiments with an absolute advantage in adopting\nthe V-L models.\n","authors":["Guiming Cao","Kaize Shi","Hong Fu","Huaiwen Zhang","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2401.06827v2.pdf","comment":"7 pages,3 figures"},{"id":"http://arxiv.org/abs/2401.12566v1","updated":"2024-01-23T08:49:23Z","published":"2024-01-23T08:49:23Z","title":"Automated Fact-Checking of Climate Change Claims with Large Language\n Models","summary":" This paper presents Climinator, a novel AI-based tool designed to automate\nthe fact-checking of climate change claims. Utilizing an array of Large\nLanguage Models (LLMs) informed by authoritative sources like the IPCC reports\nand peer-reviewed scientific literature, Climinator employs an innovative\nMediator-Advocate framework. This design allows Climinator to effectively\nsynthesize varying scientific perspectives, leading to robust, evidence-based\nevaluations. Our model demonstrates remarkable accuracy when testing claims\ncollected from Climate Feedback and Skeptical Science. Notably, when\nintegrating an advocate with a climate science denial perspective in our\nframework, Climinator's iterative debate process reliably converges towards\nscientific consensus, underscoring its adeptness at reconciling diverse\nviewpoints into science-based, factual conclusions. While our research is\nsubject to certain limitations and necessitates careful interpretation, our\napproach holds significant potential. 
We hope to stimulate further research and\nencourage exploring its applicability in other contexts, including political\nfact-checking and legal domains.\n","authors":["Markus Leippold","Saeid Ashraf Vaghefi","Dominik Stammbach","Veruska Muccione","Julia Bingler","Jingwei Ni","Chiara Colesanti-Senni","Tobias Wekhof","Tobias Schimanski","Glen Gostlow","Tingyu Yu","Juerg Luterbacher","Christian Huggel"],"pdf_url":"https://arxiv.org/pdf/2401.12566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03025v2","updated":"2024-01-23T07:49:13Z","published":"2023-10-04T17:59:41Z","title":"Retrieval meets Long Context Large Language Models","summary":" Extending the context window of large language models (LLMs) is getting\npopular recently, while the solution of augmenting LLMs with retrieval has\nexisted for years. The natural questions are: i) Retrieval-augmentation versus\nlong context window, which one is better for downstream tasks? ii) Can both\nmethods be combined to get the best of both worlds? In this work, we answer\nthese questions by studying both solutions using two state-of-the-art\npretrained LLMs, i.e., a proprietary 43B GPT and Llama2-70B. Perhaps\nsurprisingly, we find that LLM with 4K context window using simple\nretrieval-augmentation at generation can achieve comparable performance to\nfinetuned LLM with 16K context window via positional interpolation on long\ncontext tasks, while taking much less computation. More importantly, we\ndemonstrate that retrieval can significantly improve the performance of LLMs\nregardless of their extended context window sizes. Our best model,\nretrieval-augmented Llama2-70B with 32K context window, outperforms\nGPT-3.5-turbo-16k and Davinci003 in terms of average score on nine long context\ntasks including question answering, query-based summarization, and in-context\nfew-shot learning tasks. It also outperforms its non-retrieval Llama2-70B-32k\nbaseline by a margin, while being much faster at generation. Our study provides\ngeneral insights on the choice of retrieval-augmentation versus long context\nextension of LLM for practitioners.\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Lawrence McAfee","Chen Zhu","Zihan Liu","Sandeep Subramanian","Evelina Bakhturina","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.03025v2.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12540v1","updated":"2024-01-23T07:48:58Z","published":"2024-01-23T07:48:58Z","title":"DREditor: An Time-efficient Approach for Building a Domain-specific\n Dense Retrieval Model","summary":" Deploying dense retrieval models efficiently is becoming increasingly\nimportant across various industries. This is especially true for enterprise\nsearch services, where customizing search engines to meet the time demands of\ndifferent enterprises in different domains is crucial. Motivated by this, we\ndevelop a time-efficient approach called DREditor to edit the matching rule of\nan off-the-shelf dense retrieval model to suit a specific domain. This is\nachieved by directly calibrating the output embeddings of the model using an\nefficient and effective linear mapping. This mapping is powered by an edit\noperator that is obtained by solving a specially constructed least squares\nproblem. Compared to implicit rule modification via long-time finetuning, our\nexperimental results show that DREditor provides significant advantages on\ndifferent domain-specific datasets, dataset sources, retrieval models, and\ncomputing devices. 
It consistently enhances time efficiency by 100-300 times\nwhile maintaining comparable or even superior retrieval performance. In a\nbroader context, we take the first step to introduce a novel embedding\ncalibration approach for the retrieval task, filling the technical blank in the\ncurrent field of embedding calibration. This approach also paves the way for\nbuilding domain-specific dense retrieval models efficiently and inexpensively.\n","authors":["Chen Huang","Duanyu Feng","Wenqiang Lei","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2401.12540v1.pdf","comment":"15 pages, 6 figures, Codes are available at\n https://github.com/huangzichun/DREditor"},{"id":"http://arxiv.org/abs/2401.10134v2","updated":"2024-01-23T07:42:40Z","published":"2024-01-18T17:03:59Z","title":"Spatial-Temporal Large Language Model for Traffic Prediction","summary":" Traffic prediction, a critical component for intelligent transportation\nsystems, endeavors to foresee future traffic at specific locations using\nhistorical data. Although existing traffic prediction models often emphasize\ndeveloping complex neural network structures, their accuracy has not seen\nimprovements accordingly. Recently, Large Language Models (LLMs) have shown\noutstanding capabilities in time series analysis. Differing from existing\nmodels, LLMs progress mainly through parameter expansion and extensive\npre-training while maintaining their fundamental structures. In this paper, we\npropose a Spatial-Temporal Large Language Model (ST-LLM) for traffic\nprediction. Specifically, ST-LLM redefines the timesteps at each location as\ntokens and incorporates a spatial-temporal embedding module to learn the\nspatial location and global temporal representations of tokens. Then these\nrepresentations are fused to provide each token with unified spatial and\ntemporal information. Furthermore, we propose a novel partially frozen\nattention strategy of the LLM, which is designed to capture spatial-temporal\ndependencies for traffic prediction. Comprehensive experiments on real traffic\ndatasets offer evidence that ST-LLM outperforms state-of-the-art models.\nNotably, the ST-LLM also exhibits robust performance in both few-shot and\nzero-shot prediction scenarios.\n","authors":["Chenxi Liu","Sun Yang","Qianxiong Xu","Zhishuai Li","Cheng Long","Ziyue Li","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10134v2.pdf","comment":"Revise"},{"id":"http://arxiv.org/abs/2311.12373v2","updated":"2024-01-23T07:12:01Z","published":"2023-11-21T06:23:38Z","title":"Beyond Turing: A Comparative Analysis of Approaches for Detecting\n Machine-Generated Text","summary":" Significant progress has been made on text generation by pre-trained language\nmodels (PLMs), yet distinguishing between human and machine-generated text\nposes an escalating challenge. This paper offers an in-depth evaluation of\nthree distinct methods used to address this task: traditional shallow learning,\nLanguage Model (LM) fine-tuning, and Multilingual Model fine-tuning. These\napproaches are rigorously tested on a wide range of machine-generated texts,\nproviding a benchmark of their competence in distinguishing between\nhuman-authored and machine-authored linguistic constructs. The results reveal\nconsiderable differences in performance across methods, thus emphasizing the\ncontinued need for advancement in this crucial area of NLP. 
This study offers\nvaluable insights and paves the way for future research aimed at creating\nrobust and highly discriminative models.\n","authors":["Muhammad Farid Adilazuarda"],"pdf_url":"https://arxiv.org/pdf/2311.12373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04867v2","updated":"2024-01-23T06:48:45Z","published":"2024-01-10T01:02:26Z","title":"An Analysis of User Behaviors for Objectively Evaluating Spoken Dialogue\n Systems","summary":" Establishing evaluation schemes for spoken dialogue systems is important, but\nit can also be challenging. While subjective evaluations are commonly used in\nuser experiments, objective evaluations are necessary for research comparison\nand reproducibility. To address this issue, we propose a framework for\nindirectly but objectively evaluating systems based on users' behaviors. In\nthis paper, to this end, we investigate the relationship between user behaviors\nand subjective evaluation scores in social dialogue tasks: attentive listening,\njob interview, and first-meeting conversation. The results reveal that in\ndialogue tasks where user utterances are primary, such as attentive listening\nand job interview, indicators like the number of utterances and words play a\nsignificant role in evaluation. Observing disfluency also can indicate the\neffectiveness of formal tasks, such as job interview. On the other hand, in\ndialogue tasks with high interactivity, such as first-meeting conversation,\nbehaviors related to turn-taking, like average switch pause length, become more\nimportant. These findings suggest that selecting appropriate user behaviors can\nprovide valuable insights for objective evaluation in each social dialogue\ntask.\n","authors":["Koji Inoue","Divesh Lala","Keiko Ochi","Tatsuya Kawahara","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2401.04867v2.pdf","comment":"This paper has been accepted for presentation at International\n Workshop on Spoken Dialogue Systems Technology 2024 (IWSDS 2024) and\n represents the author's version of the work"},{"id":"http://arxiv.org/abs/2401.12522v1","updated":"2024-01-23T06:36:49Z","published":"2024-01-23T06:36:49Z","title":"BiTA: Bi-Directional Tuning for Lossless Acceleration in Large Language\n Models","summary":" Large language models (LLMs) commonly employ autoregressive generation during\ninference, leading to high memory bandwidth demand and consequently extended\nlatency. To mitigate this inefficiency, we present Bi-directional Tuning for\nlossless Acceleration (BiTA), an innovative method expediting LLMs via\nstreamlined semi-autoregressive generation and draft verification. Inspired by\nthe concept of prompt tuning, we enhance LLMs with a parameter-efficient design\ncalled bi-directional tuning for the capability in semi-autoregressive\ngeneration. Employing efficient tree-based decoding, the models perform draft\ncandidate generation and verification in parallel, ensuring outputs identical\nto their autoregressive counterparts under greedy sampling. BiTA serves as a\nlightweight plug-in module, seamlessly boosting the inference efficiency of\nexisting LLMs without requiring additional assistance models or incurring\nsignificant extra memory costs. Applying the proposed BiTA, LLaMA-2-70B-Chat\nachieves a 2.7$\\times$ speedup on the MT-Bench benchmark. 
Extensive experiments\nconfirm our method surpasses state-of-the-art acceleration techniques.\n","authors":["Feng Lin","Hanling Yi","Hongbin Li","Yifan Yang","Xiaotian Yu","Guangming Lu","Rong Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.12522v1.pdf","comment":"Source code at https://github.com/linfeng93/BiTA"},{"id":"http://arxiv.org/abs/2401.12520v1","updated":"2024-01-23T06:30:05Z","published":"2024-01-23T06:30:05Z","title":"Key Information Retrieval to Classify the Unstructured Data Content of\n Preferential Trade Agreements","summary":" With the rapid proliferation of textual data, predicting long texts has\nemerged as a significant challenge in the domain of natural language\nprocessing. Traditional text prediction methods encounter substantial\ndifficulties when grappling with long texts, primarily due to the presence of\nredundant and irrelevant information, which impedes the model's capacity to\ncapture pivotal insights from the text. To address this issue, we introduce a\nnovel approach to long-text classification and prediction. Initially, we employ\nembedding techniques to condense the long texts, aiming to diminish the\nredundancy therein. Subsequently,the Bidirectional Encoder Representations from\nTransformers (BERT) embedding method is utilized for text classification\ntraining. Experimental outcomes indicate that our method realizes considerable\nperformance enhancements in classifying long texts of Preferential Trade\nAgreements. Furthermore, the condensation of text through embedding methods not\nonly augments prediction accuracy but also substantially reduces computational\ncomplexity. Overall, this paper presents a strategy for long-text prediction,\noffering a valuable reference for researchers and engineers in the natural\nlanguage processing sphere.\n","authors":["Jiahui Zhao","Ziyi Meng","Stepan Gordeev","Zijie Pan","Dongjin Song","Sandro Steinbach","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2401.12520v1.pdf","comment":"AI4TS Workshop@AAAI 2024 accepted publication"},{"id":"http://arxiv.org/abs/2401.12492v1","updated":"2024-01-23T05:20:35Z","published":"2024-01-23T05:20:35Z","title":"Comparing Human-Centered Language Modeling: Is it Better to Model\n Groups, Individual Traits, or Both?","summary":" Natural language processing has made progress in incorporating human context\ninto its models, but whether it is more effective to use group-wise attributes\n(e.g., over-45-year-olds) or model individuals remains open. Group attributes\nare technically easier but coarse: not all 45-year-olds write the same way. In\ncontrast, modeling individuals captures the complexity of each person's\nidentity. It allows for a more personalized representation, but we may have to\nmodel an infinite number of users and require data that may be impossible to\nget. We compare modeling human context via group attributes, individual users,\nand combined approaches. Combining group and individual features significantly\nbenefits user-level regression tasks like age estimation or personality\nassessment from a user's documents. Modeling individual users significantly\nimproves the performance of single document-level classification tasks like\nstance and topic detection. We also find that individual-user modeling does\nwell even without user's historical data.\n","authors":["Nikita Soni","Niranjan Balasubramanian","H. 
Andrew Schwartz","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2401.12492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12491v1","updated":"2024-01-23T05:19:47Z","published":"2024-01-23T05:19:47Z","title":"Assessing and Understanding Creativity in Large Language Models","summary":" In the field of natural language processing, the rapid development of large\nlanguage model (LLM) has attracted more and more attention. LLMs have shown a\nhigh level of creativity in various tasks, but the methods for assessing such\ncreativity are inadequate. The assessment of LLM creativity needs to consider\ndifferences from humans, requiring multi-dimensional measurement while\nbalancing accuracy and efficiency. This paper aims to establish an efficient\nframework for assessing the level of creativity in LLMs. By adapting the\nmodified Torrance Tests of Creative Thinking, the research evaluates the\ncreative performance of various LLMs across 7 tasks, emphasizing 4 criteria\nincluding Fluency, Flexibility, Originality, and Elaboration. In this context,\nwe develop a comprehensive dataset of 700 questions for testing and an\nLLM-based evaluation method. In addition, this study presents a novel analysis\nof LLMs' responses to diverse prompts and role-play situations. We found that\nthe creativity of LLMs primarily falls short in originality, while excelling in\nelaboration. Besides, the use of prompts and the role-play settings of the\nmodel significantly influence creativity. Additionally, the experimental\nresults also indicate that collaboration among multiple LLMs can enhance\noriginality. Notably, our findings reveal a consensus between human evaluations\nand LLMs regarding the personality traits that influence creativity. The\nfindings underscore the significant impact of LLM design on creativity and\nbridges artificial intelligence and human creativity, offering insights into\nLLMs' creativity and potential applications.\n","authors":["Yunpu Zhao","Rui Zhang","Wenyi Li","Di Huang","Jiaming Guo","Shaohui Peng","Yifan Hao","Yuanbo Wen","Xing Hu","Zidong Du","Qi Guo","Ling Li","Yunji Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10225v2","updated":"2024-01-23T05:04:32Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Building GPT-4 Level Conversational QA Models","summary":" In this work, we introduce ChatQA, a family of conversational question\nanswering (QA) models that obtain GPT-4 level accuracies. Specifically, we\npropose a two-stage instruction tuning method that can significantly improve\nthe zero-shot conversational QA results from large language models (LLMs). To\nhandle retrieval-augmented generation in conversational QA, we fine-tune a\ndense retriever on a multi-turn QA dataset, which provides comparable results\nto using the state-of-the-art query rewriting model while largely reducing\ndeployment cost. Notably, our ChatQA-70B can outperform GPT-4 in terms of\naverage score on 10 conversational QA datasets (54.14 vs. 
53.90), without\nrelying on any synthetic data from OpenAI GPT models.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Chankyu Lee","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v2.pdf","comment":"We added ChatQA-22B results"},{"id":"http://arxiv.org/abs/2305.09781v3","updated":"2024-01-23T05:02:03Z","published":"2023-05-16T20:12:59Z","title":"SpecInfer: Accelerating Generative Large Language Model Serving with\n Tree-based Speculative Inference and Verification","summary":" This paper introduces SpecInfer, a system that accelerates generative large\nlanguage model (LLM) serving with tree-based speculative inference and\nverification. The key idea behind SpecInfer is leveraging small speculative\nmodels to predict the LLM's outputs; the predictions are organized as a token\ntree, whose nodes each represent a candidate token sequence. The correctness of\nall candidate token sequences represented by a token tree is verified against\nthe LLM in parallel using a novel tree-based parallel decoding mechanism.\nSpecInfer uses an LLM as a token tree verifier instead of an incremental\ndecoder, which significantly reduces the end-to-end latency and computational\nrequirement for serving generative LLMs while provably preserving model\nquality. Our evaluation shows that SpecInfer outperforms existing LLM serving\nsystems by 1.5-2.8x for distributed LLM inference and by 2.6-3.5x for\noffloading-based LLM inference, while preserving the same generative\nperformance. SpecInfer is publicly available at\nhttps://github.com/flexflow/FlexFlow/\n","authors":["Xupeng Miao","Gabriele Oliaro","Zhihao Zhang","Xinhao Cheng","Zeyu Wang","Zhengxin Zhang","Rae Ying Yee Wong","Alan Zhu","Lijie Yang","Xiaoxiang Shi","Chunan Shi","Zhuoming Chen","Daiyaan Arfeen","Reyna Abhyankar","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2305.09781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07213v2","updated":"2024-01-23T04:59:29Z","published":"2023-08-14T15:31:32Z","title":"Human-centered NLP Fact-checking: Co-Designing with Fact-checkers using\n Matchmaking for AI","summary":" While many Natural Language Processing (NLP) techniques have been proposed\nfor fact-checking, both academic research and fact-checking organizations\nreport limited adoption of such NLP work due to poor alignment with\nfact-checker practices, values, and needs. To address this, we investigate a\nco-design method, Matchmaking for AI, to enable fact-checkers, designers, and\nNLP researchers to collaboratively identify what fact-checker needs should be\naddressed by technology, and to brainstorm ideas for potential solutions.\nCo-design sessions we conducted with 22 professional fact-checkers yielded a\nset of 11 design ideas that offer a \"north star\", integrating fact-checker\ncriteria into novel NLP design concepts. These concepts range from pre-bunking\nmisinformation, efficient and personalized monitoring misinformation,\nproactively reducing fact-checker potential biases, and collaborative writing\nfact-check reports. 
Our work provides new insights into both human-centered\nfact-checking research and practice and AI co-design research.\n","authors":["Houjiang Liu","Anubrata Das","Alexander Boltz","Didi Zhou","Daisy Pinaroc","Matthew Lease","Min Kyung Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02994v3","updated":"2024-01-23T04:43:56Z","published":"2024-01-04T07:45:49Z","title":"Blending Is All You Need: Cheaper, Better Alternative to\n Trillion-Parameters LLM","summary":" In conversational AI research, there's a noticeable trend towards developing\nmodels with a larger number of parameters, exemplified by models like ChatGPT.\nWhile these expansive models tend to generate increasingly better chat\nresponses, they demand significant computational resources and memory. This\nstudy explores a pertinent question: Can a combination of smaller models\ncollaboratively achieve comparable or enhanced performance relative to a\nsingular large model? We introduce an approach termed \"blending\", a\nstraightforward yet effective method of integrating multiple chat AIs. Our\nempirical evidence suggests that when specific smaller models are\nsynergistically blended, they can potentially outperform or match the\ncapabilities of much larger counterparts. For instance, integrating just three\nmodels of moderate size (6B/13B paramaeters) can rival or even surpass the\nperformance metrics of a substantially larger model like ChatGPT (175B+\nparamaters). This hypothesis is rigorously tested using A/B testing\nmethodologies with a large user base on the Chai research platform over a span\nof thirty days. The findings underscore the potential of the \"blending\"\nstrategy as a viable approach for enhancing chat AI efficacy without a\ncorresponding surge in computational demands.\n","authors":["Xiaoding Lu","Zongyi Liu","Adian Liusie","Vyas Raina","Vineet Mudupalli","Yuwen Zhang","William Beauchamp"],"pdf_url":"https://arxiv.org/pdf/2401.02994v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12474v1","updated":"2024-01-23T03:56:22Z","published":"2024-01-23T03:56:22Z","title":"Large Language Models are Superpositions of All Characters: Attaining\n Arbitrary Role-play via Self-Alignment","summary":" Considerable efforts have been invested in augmenting the role-playing\nproficiency of open-source large language models (LLMs) by emulating\nproprietary counterparts. Nevertheless, we posit that LLMs inherently harbor\nrole-play capabilities, owing to the extensive knowledge of characters and\npotential dialogues ingrained in their vast training corpora. Thus, in this\nstudy, we introduce Ditto, a self-alignment method for role-play. Ditto\ncapitalizes on character knowledge, encouraging an instruction-following LLM to\nsimulate role-play dialogues as a variant of reading comprehension. This method\ncreates a role-play training set comprising 4,000 characters, surpassing the\nscale of currently available datasets by tenfold regarding the number of roles.\nSubsequently, we fine-tune the LLM using this self-generated dataset to augment\nits role-playing capabilities. Upon evaluating our meticulously constructed and\nreproducible role-play benchmark and the roleplay subset of MT-Bench, Ditto, in\nvarious parameter scales, consistently maintains a consistent role identity and\nprovides accurate role-specific knowledge in multi-turn role-play\nconversations. 
Notably, it outperforms all open-source role-play baselines,\nshowcasing performance levels comparable to advanced proprietary chatbots.\nFurthermore, we present the first comprehensive cross-supervision alignment\nexperiment in the role-play domain, revealing that the intrinsic capabilities\nof LLMs confine the knowledge within role-play. Meanwhile, the role-play styles\ncan be easily acquired with the guidance of smaller models. We open-source\nrelated resources at https://github.com/OFA-Sys/Ditto.\n","authors":["Keming Lu","Bowen Yu","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.12474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12472v1","updated":"2024-01-23T03:47:07Z","published":"2024-01-23T03:47:07Z","title":"Contrastive Learning in Distilled Models","summary":" Natural Language Processing models like BERT can provide state-of-the-art\nword embeddings for downstream NLP tasks. However, these models yet to perform\nwell on Semantic Textual Similarity, and may be too large to be deployed as\nlightweight edge applications. We seek to apply a suitable contrastive learning\nmethod based on the SimCSE paper, to a model architecture adapted from a\nknowledge distillation based model, DistilBERT, to address these two issues.\nOur final lightweight model DistilFace achieves an average of 72.1 in\nSpearman's correlation on STS tasks, a 34.2 percent improvement over BERT base.\n","authors":["Valerie Lim","Kai Wen Ng","Kenneth Lim"],"pdf_url":"https://arxiv.org/pdf/2401.12472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v2","updated":"2024-01-23T03:35:40Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["Man Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11033v2","updated":"2024-01-23T03:30:11Z","published":"2024-01-19T21:21:02Z","title":"FAIR Enough: How Can We Develop and Assess a FAIR-Compliant Dataset for\n Large Language Models' Training?","summary":" The rapid evolution of Large Language Models (LLMs) underscores the critical\nimportance of ethical considerations and data integrity in AI development,\nemphasizing the role of FAIR (Findable, Accessible, Interoperable, Reusable)\ndata principles. 
While these principles have long been a cornerstone of ethical\ndata stewardship, their application in LLM training data is less prevalent, an\nissue our research aims to address. Our study begins with a review of existing\nliterature, highlighting the significance of FAIR principles in data management\nfor model training. Building on this foundation, we introduce a novel framework\nthat incorporates FAIR principles into the LLM training process. A key aspect\nof this approach is a comprehensive checklist, designed to assist researchers\nand developers in consistently applying FAIR data principles throughout the\nmodel development lifecycle. The practicality and effectiveness of our\nframework are demonstrated through a case study that involves creating a\nFAIR-compliant dataset to detect and reduce biases. This case study not only\nvalidates the usefulness of our framework but also establishes new benchmarks\nfor more equitable, transparent, and ethical practices in LLM training. We\noffer this framework to the community as a means to promote technologically\nadvanced, ethically sound, and socially responsible AI models.\n","authors":["Shaina Raza","Shardul Ghuge","Chen Ding","Deval Pandya"],"pdf_url":"https://arxiv.org/pdf/2401.11033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04691v5","updated":"2024-01-23T03:25:22Z","published":"2023-10-07T05:37:41Z","title":"EMO: Earth Mover Distance Optimization for Auto-Regressive Language\n Modeling","summary":" Neural language models are probabilistic models of human text. They are\npredominantly trained using maximum likelihood estimation (MLE), which is\nequivalent to minimizing the forward cross-entropy between the empirical data\ndistribution and the model distribution. However, various degeneration\nphenomena are still widely observed when decoding from the distributions\nlearned by such models. We establish that the forward cross-entropy is\nsuboptimal as a distance metric for aligning human and model distribution due\nto its (1) recall-prioritization (2) negative diversity ignorance and (3)\ntrain-test mismatch. In this paper, we propose Earth Mover Distance\nOptimization (EMO) for auto-regressive language modeling. EMO capitalizes on\nthe inherent properties of earth mover distance to address the aforementioned\nchallenges. Due to the high complexity of direct computation, we further\nintroduce a feasible upper bound for EMO to ease end-to-end training. Upon\nextensive evaluation of language models trained using EMO and MLE. We find that\nEMO demonstrates a consistently better language modeling performance than MLE\nacross domains. Moreover, EMO demonstrates noteworthy enhancements in\ndownstream performance with minimal fine-tuning on merely 25,000 sentences.\nThis highlights the tremendous potential of EMO as a lightweight calibration\nmethod for enhancing large-scale pre-trained language models.\n","authors":["Siyu Ren","Zhiyong Wu","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.04691v5.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12461v1","updated":"2024-01-23T03:03:57Z","published":"2024-01-23T03:03:57Z","title":"Fast Adversarial Training against Textual Adversarial Attacks","summary":" Many adversarial defense methods have been proposed to enhance the\nadversarial robustness of natural language processing models. 
However, most of\nthem introduce additional pre-set linguistic knowledge and assume that the\nsynonym candidates used by attackers are accessible, which is an ideal\nassumption. We delve into adversarial training in the embedding space and\npropose a Fast Adversarial Training (FAT) method to improve the model\nrobustness in the synonym-unaware scenario from the perspective of single-step\nperturbation generation and perturbation initialization. Based on the\nobservation that the adversarial perturbations crafted by single-step and\nmulti-step gradient ascent are similar, FAT uses single-step gradient ascent to\ncraft adversarial examples in the embedding space to expedite the training\nprocess. Based on the observation that the perturbations generated on the\nidentical training sample in successive epochs are similar, FAT fully utilizes\nhistorical information when initializing the perturbation. Extensive\nexperiments demonstrate that FAT significantly boosts the robustness of BERT\nmodels in the synonym-unaware scenario, and outperforms the defense baselines\nunder various attacks with character-level and word-level modifications.\n","authors":["Yichen Yang","Xin Liu","Kun He"],"pdf_url":"https://arxiv.org/pdf/2401.12461v1.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.09552v3","updated":"2024-01-23T02:59:44Z","published":"2023-09-18T08:03:54Z","title":"A Multitask Training Approach to Enhance Whisper with Contextual Biasing\n and Open-Vocabulary Keyword Spotting","summary":" End-to-end automatic speech recognition (ASR) systems often struggle to\nrecognize rare name entities, such as personal names, organizations, and\nterminologies not frequently encountered in the training data. This paper\npresents Contextual Biasing Whisper (CB-Whisper), a novel ASR system based on\nOpenAI's Whisper model that can recognize user-defined name entities by\nperforming open-vocabulary keyword-spotting (OV-KWS) using the hidden states of\nWhisper encoder. The recognized entities are used as prompts for the Whisper\ndecoder. We first propose a multitask training approach with OV-KWS and ASR\ntasks to optimize the model. Experiments show that this approach substantially\nimproves the entity recalls compared to the original Whisper model on Chinese\nAishell hot word subsets and two internal code-switch test sets. However, we\nobserved a slight increase in mixed-error-rate (MER) on internal test sets due\nto catastrophic forgetting. To address this problem and use different sizes of\nthe Whisper model without finetuning, we propose to use OV-KWS as a separate\nmodule and construct a spoken form prompt to prevent hallucination. The OV-KWS\nmodule consistently improves MER and Entity Recall for whisper-small, medium,\nand large models.\n","authors":["Yuang Li","Yinglu Li","Min Zhang","Chang Su","Mengxin Ren","Xiaosong Qiao","Xiaofeng Zhao","Mengyao Piao","Jiawei Yu","Xinglin Lv","Miaomiao Ma","Yanqing Zhao","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2309.09552v3.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.02317v3","updated":"2024-01-23T02:29:35Z","published":"2023-05-03T17:58:29Z","title":"Visual Chain of Thought: Bridging Logical Gaps with Multimodal\n Infillings","summary":" Recent advances in large language models elicit reasoning in a\nchain-of-thought that allows models to decompose problems in a human-like\nfashion. 
Though this paradigm improves multi-step reasoning ability in language\nmodels, it is limited by being unimodal and applied mainly to\nquestion-answering tasks. We claim that incorporating visual augmentation into\nreasoning is essential, especially for complex, imaginative tasks.\nConsequently, we introduce VCoT, a novel method that leverages chain-of-thought\nprompting with vision-language grounding to recursively bridge the logical gaps\nwithin sequential data. Our method uses visual guidance to generate synthetic\nmultimodal infillings that add consistent and novel information to reduce the\nlogical gaps for downstream tasks that can benefit from temporal reasoning, as\nwell as provide interpretability into models' multi-step reasoning. We apply\nVCoT to the Visual Storytelling and WikiHow summarization datasets and\ndemonstrate through human evaluation that VCoT offers novel and consistent\nsynthetic data augmentation beating chain-of-thought baselines, which can be\nused to enhance downstream performance.\n","authors":["Daniel Rose","Vaishnavi Himakunthala","Andy Ouyang","Ryan He","Alex Mei","Yujie Lu","Michael Saxon","Chinmay Sonar","Diba Mirza","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2305.02317v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13010v2","updated":"2024-01-23T02:12:35Z","published":"2023-12-20T13:22:41Z","title":"AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and\n Optimisation","summary":" The advancement of natural language processing (NLP) has been significantly\nboosted by the development of transformer-based large language models (LLMs).\nThese models have revolutionized NLP tasks, particularly in code generation,\naiding developers in creating software with enhanced efficiency. Despite their\nadvancements, challenges in balancing code snippet generation with effective\ntest case generation and execution persist. To address these issues, this paper\nintroduces Multi-Agent Assistant Code Generation (AgentCoder), a novel solution\ncomprising a multi-agent framework with specialized agents: the programmer\nagent, the test designer agent, and the test executor agent. During the coding\nprocedure, the programmer agent will focus on the code generation and\nrefinement based on the test executor agent's feedback. The test designer agent\nwill generate test cases for the generated code, and the test executor agent\nwill run the code with the test cases and write the feedback to the programmer.\nThis collaborative system ensures robust code generation, surpassing the\nlimitations of single-agent models and traditional methodologies. Our extensive\nexperiments on 9 code generation models and 12 enhancement approaches showcase\nAgentCoder's superior performance over existing code generation models and\nprompt engineering techniques across various benchmarks. For example,\nAgentCoder achieves 77.4% and 89.1% pass@1 in HumanEval-ET and MBPP-ET with\nGPT-3.5, while SOTA baselines obtain only 69.5% and 63.0%.\n","authors":["Dong Huang","Qingwen Bu","Jie M. Zhang","Michael Luck","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2312.13010v2.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.16692v2","updated":"2024-01-23T01:56:57Z","published":"2023-08-31T12:53:09Z","title":"SpeechTokenizer: Unified Speech Tokenizer for Speech Large Language\n Models","summary":" Current speech large language models build upon discrete speech\nrepresentations, which can be categorized into semantic tokens and acoustic\ntokens. 
However, existing speech tokens are not specifically designed for\nspeech language modeling. To assess the suitability of speech tokens for\nbuilding speech language models, we established the first benchmark,\nSLMTokBench. Our results indicate that neither semantic nor acoustic tokens are\nideal for this purpose. Therefore, we propose SpeechTokenizer, a unified speech\ntokenizer for speech large language models. SpeechTokenizer adopts the\nEncoder-Decoder architecture with residual vector quantization (RVQ). Unifying\nsemantic and acoustic tokens, SpeechTokenizer disentangles different aspects of\nspeech information hierarchically across different RVQ layers. Furthermore, We\nconstruct a Unified Speech Language Model (USLM) leveraging SpeechTokenizer.\nExperiments show that SpeechTokenizer performs comparably to EnCodec in speech\nreconstruction and demonstrates strong performance on the SLMTokBench\nbenchmark. Also, USLM outperforms VALL-E in zero-shot Text-to-Speech tasks.\nCode and models are available at\nhttps://github.com/ZhangXInFD/SpeechTokenizer/.\n","authors":["Xin Zhang","Dong Zhang","Shimin Li","Yaqian Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.16692v2.pdf","comment":"Accepted by ICLR 2024. Project page is at\n https://0nutation.github.io/SpeechTokenizer.github.io/"},{"id":"http://arxiv.org/abs/2401.12428v1","updated":"2024-01-23T01:33:09Z","published":"2024-01-23T01:33:09Z","title":"CIM-MLC: A Multi-level Compilation Stack for Computing-In-Memory\n Accelerators","summary":" In recent years, various computing-in-memory (CIM) processors have been\npresented, showing superior performance over traditional architectures. To\nunleash the potential of various CIM architectures, such as device precision,\ncrossbar size, and crossbar number, it is necessary to develop compilation\ntools that are fully aware of the CIM architectural details and implementation\ndiversity. However, due to the lack of architectural support in current popular\nopen-source compiling stacks, existing CIM designs either manually deploy\nnetworks or build their own compilers, which is time-consuming and\nlabor-intensive. Although some works expose the specific CIM device programming\ninterfaces to compilers, they are often bound to a fixed CIM architecture,\nlacking the flexibility to support the CIM architectures with different\ncomputing granularity. On the other hand, existing compilation works usually\nconsider the scheduling of limited operation types (such as crossbar-bound\nmatrix-vector multiplication). Unlike conventional processors, CIM accelerators\nare featured by their diverse architecture, circuit, and device, which cannot\nbe simply abstracted by a single level if we seek to fully explore the\nadvantages brought by CIM. Therefore, we propose CIM-MLC, a universal\nmulti-level compilation framework for general CIM architectures. We first\nestablish a general hardware abstraction for CIM architectures and computing\nmodes to represent various CIM accelerators. Based on the proposed abstraction,\nCIM-MLC can compile tasks onto a wide range of CIM accelerators having\ndifferent devices, architectures, and programming interfaces. 
More importantly,\ncompared with existing compilation work, CIM-MLC can explore the mapping and\nscheduling strategies across multiple architectural tiers, which form a\ntractable yet effective design space, to achieve better scheduling and\ninstruction generation results.\n","authors":["Songyun Qu","Shixin Zhao","Bing Li","Yintao He","Xuyi Cai","Lei Zhang","Ying Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12428v1.pdf","comment":"16 pages, 22 figures"},{"id":"http://arxiv.org/abs/2401.12425v1","updated":"2024-01-23T01:25:00Z","published":"2024-01-23T01:25:00Z","title":"The Neglected Tails of Vision-Language Models","summary":" Vision-language models (VLMs) excel in zero-shot recognition but exhibit\ndrastically imbalanced performance across visual concepts. For example, CLIP,\ndespite an impressive mean zero-shot accuracy on ImageNet (72.7%), yields\n$<$10% on ten concepts (e.g., gyromitra and night snake), presumably, because\nthese concepts are under-represented in VLMs' imbalanced pretraining data. Yet,\nassessing this imbalance is challenging as it is non-trivial to calculate the\nfrequency of specific concepts within VLMs' large-scale pretraining data. Our\nwork makes the first attempt to measure the concept frequency by analyzing\npretraining texts. We use off-the-shelf language models to help count relevant\ntexts that contain synonyms of the given concepts and resolve linguistic\nambiguity. We confirm that popular VLM datasets like LAION indeed exhibit\nlong-tailed concept distributions, which strongly correlate with per-class\naccuracies. Further, contemporary multimodal systems, e.g., visual chatbots and\ntext-to-image generators, also struggle with the rare concepts identified by\nour method. To mitigate VLMs' imbalanced performance in zero-shot recognition,\nwe propose REtrieval-Augmented Learning REAL. First, instead of prompting VLMs\nusing the original class names, REAL uses their most frequent synonyms found in\nVLMs' pretraining texts. This already outperforms human-engineered and\nLLM-generated prompts over nine benchmark datasets, likely because VLMs have\nseen more images associated with the frequently used synonyms. Second, REAL\nuses all the concept synonyms to retrieve a small, class-balanced set of\npretraining data to train a robust classifier. REAL surpasses the recent\nretrieval-augmented solution REACT, using 400x less storage and 10,000x less\ntraining time!\n","authors":["Shubham Parashar","Zhiqiu Lin","Tian Liu","Xiangjue Dong","Yanan Li","Deva Ramanan","James Caverlee","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2401.12425v1.pdf","comment":"Project Page:\n https://shubhamprshr27.github.io/neglected-tails-of-vlms/"},{"id":"http://arxiv.org/abs/2401.13146v1","updated":"2024-01-23T23:46:01Z","published":"2024-01-23T23:46:01Z","title":"Locality enhanced dynamic biasing and sampling strategies for contextual\n ASR","summary":" Automatic Speech Recognition (ASR) still faces challenges when recognizing\ntime-variant rare phrases. Contextual biasing (CB) modules bias the ASR model\ntowards such contextually-relevant phrases. During training, a list of biasing\nphrases is selected from a large pool of phrases following a sampling\nstrategy. In this work, we first analyse different sampling strategies to\nprovide insights into the training of CB for ASR with correlation plots between\nthe bias embeddings among various training stages. 
Secondly, we introduce a\nneighbourhood attention (NA) that localizes self attention (SA) to the nearest\nneighbouring frames to further refine the CB output. The results show that this\nproposed approach provides on average a 25.84% relative WER improvement on\nLibriSpeech sets and rare-word evaluation compared to the baseline.\n","authors":["Md Asif Jalal","Pablo Peso Parada","George Pavlidis","Vasileios Moschopoulos","Karthikeyan Saravanan","Chrysovalantis-Giorgos Kontoulis","Jisi Zhang","Anastasios Drosou","Gil Ho Lee","Jungin Lee","Seokyeong Jung"],"pdf_url":"https://arxiv.org/pdf/2401.13146v1.pdf","comment":"Accepted for IEEE ASRU 2023"},{"id":"http://arxiv.org/abs/2309.06657v2","updated":"2024-01-23T23:16:11Z","published":"2023-09-13T01:07:25Z","title":"Statistical Rejection Sampling Improves Preference Optimization","summary":" Improving the alignment of language models with human preferences remains an\nactive research challenge. Previous approaches have primarily utilized\nReinforcement Learning from Human Feedback (RLHF) via online RL methods such as\nProximal Policy Optimization (PPO). Recently, offline methods such as Sequence\nLikelihood Calibration (SLiC) and Direct Preference Optimization (DPO) have\nemerged as attractive alternatives, offering improvements in stability and\nscalability while maintaining competitive performance. SLiC refines its loss\nfunction using sequence pairs sampled from a supervised fine-tuned (SFT)\npolicy, while DPO directly optimizes language models based on preference data,\nforegoing the need for a separate reward model. However, the maximum likelihood\nestimator (MLE) of the target optimal policy requires labeled preference pairs\nsampled from that policy. DPO's lack of a reward model constrains its ability\nto sample preference pairs from the optimal policy, and SLiC is restricted to\nsampling preference pairs only from the SFT policy. To address these\nlimitations, we introduce a novel approach called Statistical Rejection\nSampling Optimization (RSO) that aims to source preference data from the target\noptimal policy using rejection sampling, enabling a more accurate estimation of\nthe optimal policy. We also propose a unified framework that enhances the loss\nfunctions used in both SLiC and DPO from a preference modeling standpoint.\nThrough extensive experiments across three diverse tasks, we demonstrate that\nRSO consistently outperforms both SLiC and DPO on evaluations from both Large\nLanguage Model (LLM) and human raters.\n","authors":["Tianqi Liu","Yao Zhao","Rishabh Joshi","Misha Khalman","Mohammad Saleh","Peter J. Liu","Jialu Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06657v2.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.13136v1","updated":"2024-01-23T23:12:09Z","published":"2024-01-23T23:12:09Z","title":"The Language Barrier: Dissecting Safety Challenges of LLMs in\n Multilingual Contexts","summary":" As the influence of large language models (LLMs) spans across global\ncommunities, their safety challenges in multilingual settings become paramount\nfor alignment research. This paper examines the variations in safety challenges\nfaced by LLMs across different languages and discusses approaches to\nalleviating such concerns. By comparing how state-of-the-art LLMs respond to\nthe same set of malicious prompts written in higher- vs. 
lower-resource\nlanguages, we observe that (1) LLMs tend to generate unsafe responses much more\noften when a malicious prompt is written in a lower-resource language, and (2)\nLLMs tend to generate more irrelevant responses to malicious prompts in\nlower-resource languages. To understand where the discrepancy can be\nattributed, we study the effect of instruction tuning with reinforcement\nlearning from human feedback (RLHF) or supervised finetuning (SFT) on the\nHH-RLHF dataset. Surprisingly, while training with high-resource languages\nimproves model alignment, training in lower-resource languages yields minimal\nimprovement. This suggests that the bottleneck of cross-lingual alignment is\nrooted in the pretraining stage. Our findings highlight the challenges in\ncross-lingual LLM safety, and we hope they inform future research in this\ndirection.\n","authors":["Lingfeng Shen","Weiting Tan","Sihao Chen","Yunmo Chen","Jingyu Zhang","Haoran Xu","Boyuan Zheng","Philipp Koehn","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2401.13136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13133v1","updated":"2024-01-23T22:49:19Z","published":"2024-01-23T22:49:19Z","title":"Analyzing COVID-19 Vaccination Sentiments in Nigerian Cyberspace:\n Insights from a Manually Annotated Twitter Dataset","summary":" Numerous successes have been achieved in combating the COVID-19 pandemic,\ninitially using various precautionary measures like lockdowns, social\ndistancing, and the use of face masks. More recently, various vaccinations have\nbeen developed to aid in the prevention or reduction of the severity of the\nCOVID-19 infection. Despite the effectiveness of the precautionary measures and\nthe vaccines, there are several controversies that are massively shared on\nsocial media platforms like Twitter. In this paper, we explore the use of\nstate-of-the-art transformer-based language models to study people's acceptance\nof vaccines in Nigeria. We developed a novel dataset by crawling multi-lingual\ntweets using relevant hashtags and keywords. Our analysis and visualizations\nrevealed that most tweets expressed neutral sentiments about COVID-19 vaccines,\nwith some individuals expressing positive views, and there was no strong\npreference for specific vaccine types, although Moderna received slightly more\npositive sentiment. We also found out that fine-tuning a pre-trained LLM with\nan appropriate dataset can yield competitive results, even if the LLM was not\ninitially pre-trained on the specific language of that dataset.\n","authors":["Ibrahim Said Ahmad","Lukman Jibril Aliyu","Abubakar Auwal Khalid","Saminu Muhammad Aliyu","Shamsuddeen Hassan Muhammad","Idris Abdulmumin","Bala Mairiga Abduljalil","Bello Shehu Bello","Amina Imam Abubakar"],"pdf_url":"https://arxiv.org/pdf/2401.13133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06373v2","updated":"2024-01-23T22:46:12Z","published":"2024-01-12T16:13:24Z","title":"How Johnny Can Persuade LLMs to Jailbreak Them: Rethinking Persuasion to\n Challenge AI Safety by Humanizing LLMs","summary":" Most traditional AI safety research has approached AI models as machines and\ncentered on algorithm-focused attacks developed by security experts. As large\nlanguage models (LLMs) become increasingly common and competent, non-expert\nusers can also impose risks during daily interactions. 
This paper introduces a\nnew perspective to jailbreak LLMs as human-like communicators, to explore this\noverlooked intersection between everyday language interaction and AI safety.\nSpecifically, we study how to persuade LLMs to jailbreak them. First, we\npropose a persuasion taxonomy derived from decades of social science research.\nThen, we apply the taxonomy to automatically generate interpretable persuasive\nadversarial prompts (PAP) to jailbreak LLMs. Results show that persuasion\nsignificantly increases the jailbreak performance across all risk categories:\nPAP consistently achieves an attack success rate of over $92\\%$ on Llama 2-7b\nChat, GPT-3.5, and GPT-4 in $10$ trials, surpassing recent algorithm-focused\nattacks. On the defense side, we explore various mechanisms against PAP, find\na significant gap in existing defenses, and advocate for more fundamental\nmitigation for highly interactive LLMs.\n","authors":["Yi Zeng","Hongpeng Lin","Jingwen Zhang","Diyi Yang","Ruoxi Jia","Weiyan Shi"],"pdf_url":"https://arxiv.org/pdf/2401.06373v2.pdf","comment":"14 pages of the main text, qualitative examples of jailbreaks may be\n harmful in nature"},{"id":"http://arxiv.org/abs/2401.13129v1","updated":"2024-01-23T22:36:03Z","published":"2024-01-23T22:36:03Z","title":"Seed-Guided Fine-Grained Entity Typing in Science and Engineering\n Domains","summary":" Accurately typing entity mentions from text segments is a fundamental task\nfor various natural language processing applications. Many previous approaches\nrely on massive human-annotated data to perform entity typing. Nevertheless,\ncollecting such data in highly specialized science and engineering domains\n(e.g., software engineering and security) can be time-consuming and costly,\nwithout mentioning the domain gaps between training and inference data if the\nmodel needs to be applied to confidential datasets. In this paper, we study the\ntask of seed-guided fine-grained entity typing in science and engineering\ndomains, which takes the name and a few seed entities for each entity type as\nthe only supervision and aims to classify new entity mentions into both seen\nand unseen types (i.e., those without seed entities). To solve this problem, we\npropose SEType which first enriches the weak supervision by finding more\nentities for each seen type from an unlabeled corpus using the contextualized\nrepresentations of pre-trained language models. It then matches the enriched\nentities to unlabeled text to get pseudo-labeled samples and trains a textual\nentailment model that can make inferences for both seen and unseen types.\nExtensive experiments on two datasets covering four domains demonstrate the\neffectiveness of SEType in comparison with various baselines.\n","authors":["Yu Zhang","Yunyi Zhang","Yanzhen Shen","Yu Deng","Lucian Popa","Larisa Shwartz","ChengXiang Zhai","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2401.13129v1.pdf","comment":"9 pages; Accepted to AAAI 2024 (Code:\n https://github.com/yuzhimanhua/SEType)"},{"id":"http://arxiv.org/abs/2202.12312v2","updated":"2024-01-23T22:09:07Z","published":"2022-02-24T19:00:39Z","title":"Oolong: Investigating What Makes Transfer Learning Hard with Controlled\n Studies","summary":" When we transfer a pretrained language model to a new language, there are\nmany axes of variation that change at once. 
To disentangle the impact of\ndifferent factors like syntactic similarity and vocabulary similarity, we\npropose a set of controlled transfer studies: we systematically transform the\nlanguage of the GLUE benchmark, altering one axis of crosslingual variation at\na time, and then measure the resulting drops in a pretrained model's downstream\nperformance. We find that models can largely recover from syntactic-style\nshifts, but cannot recover from vocabulary misalignment and embedding matrix\nre-initialization, even with continued pretraining on 15 million tokens. %On\nthe other hand, transferring to a dataset with an unaligned vocabulary is\nextremely hard to recover from in the low-data regime. Moreover, good-quality\ntokenizers in the transfer language do not make vocabulary alignment easier.\nOur experiments provide insights into the factors of cross-lingual transfer\nthat researchers should most focus on when designing language transfer\nscenarios.\n","authors":["Zhengxuan Wu","Alex Tamkin","Isabel Papadimitriou"],"pdf_url":"https://arxiv.org/pdf/2202.12312v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2303.13716v2","updated":"2024-01-23T21:52:42Z","published":"2023-03-24T00:01:24Z","title":"ReCOGS: How Incidental Details of a Logical Form Overshadow an\n Evaluation of Semantic Interpretation","summary":" Compositional generalization benchmarks for semantic parsing seek to assess\nwhether models can accurately compute meanings for novel sentences, but\noperationalize this in terms of logical form (LF) prediction. This raises the\nconcern that semantically irrelevant details of the chosen LFs could shape\nmodel performance. We argue that this concern is realized for the COGS\nbenchmark. COGS poses generalization splits that appear impossible for\npresent-day models, which could be taken as an indictment of those models.\nHowever, we show that the negative results trace to incidental features of COGS\nLFs. Converting these LFs to semantically equivalent ones and factoring out\ncapabilities unrelated to semantic interpretation, we find that even baseline\nmodels get traction. A recent variable-free translation of COGS LFs suggests\nsimilar conclusions, but we observe this format is not semantically equivalent;\nit is incapable of accurately representing some COGS meanings. These findings\ninform our proposal for ReCOGS, a modified version of COGS that comes closer to\nassessing the target semantic capabilities while remaining very challenging.\nOverall, our results reaffirm the importance of compositional generalization\nand careful benchmark task design.\n","authors":["Zhengxuan Wu","Christopher D. Manning","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2303.13716v2.pdf","comment":"TACL 2023"},{"id":"http://arxiv.org/abs/2310.02374v4","updated":"2024-01-23T21:27:14Z","published":"2023-10-03T18:54:10Z","title":"Conversational Health Agents: A Personalized LLM-Powered Agent Framework","summary":" Conversational Health Agents (CHAs) are interactive systems that provide\nhealthcare services, such as assistance and diagnosis. Current CHAs, especially\nthose utilizing Large Language Models (LLMs), primarily focus on conversation\naspects. However, they offer limited agent capabilities, specifically lacking\nmulti-step problem-solving, personalized conversations, and multimodal data\nanalysis. Our aim is to overcome these limitations. 
We propose openCHA, an\nopen-source LLM-powered framework, to empower conversational agents to generate\na personalized response for users' healthcare queries. This framework enables\ndevelopers to integrate external sources including data sources, knowledge\nbases, and analysis models, into their LLM-based solutions. openCHA includes an\norchestrator to plan and execute actions for gathering information from\nexternal sources, essential for formulating responses to user inquiries. It\nfacilitates knowledge acquisition, problem-solving capabilities, multilingual\nand multimodal conversations, and fosters interaction with various AI\nplatforms. We illustrate the framework's proficiency in handling complex\nhealthcare tasks via three demonstrations. Moreover, we release openCHA as open\nsource available to the community via GitHub.\n","authors":["Mahyar Abbasian","Iman Azimi","Amir M. Rahmani","Ramesh Jain"],"pdf_url":"https://arxiv.org/pdf/2310.02374v4.pdf","comment":"23 pages, 6 figures, 3 tables, journal paper"},{"id":"http://arxiv.org/abs/2305.08809v2","updated":"2024-01-23T21:25:20Z","published":"2023-05-15T17:15:40Z","title":"Interpretability at Scale: Identifying Causal Mechanisms in Alpaca","summary":" Obtaining human-interpretable explanations of large, general-purpose language\nmodels is an urgent goal for AI safety. However, it is just as important that\nour interpretability methods are faithful to the causal dynamics underlying\nmodel behavior and able to robustly generalize to unseen inputs. Distributed\nAlignment Search (DAS) is a powerful gradient descent method grounded in a\ntheory of causal abstraction that has uncovered perfect alignments between\ninterpretable symbolic algorithms and small deep learning models fine-tuned for\nspecific tasks. In the present paper, we scale DAS significantly by replacing\nthe remaining brute-force search steps with learned parameters -- an approach\nwe call Boundless DAS. This enables us to efficiently search for interpretable\ncausal structure in large language models while they follow instructions. We\napply Boundless DAS to the Alpaca model (7B parameters), which, off the shelf,\nsolves a simple numerical reasoning problem. With Boundless DAS, we discover\nthat Alpaca does this by implementing a causal model with two interpretable\nboolean variables. Furthermore, we find that the alignment of neural\nrepresentations with these variables is robust to changes in inputs and\ninstructions. These findings mark a first step toward faithfully understanding\nthe inner-workings of our ever-growing and most widely deployed language\nmodels. Our tool is extensible to larger LLMs and is released publicly at\n`https://github.com/stanfordnlp/pyvene`.\n","authors":["Zhengxuan Wu","Atticus Geiger","Christopher Potts","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2305.08809v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.13086v1","updated":"2024-01-23T20:55:49Z","published":"2024-01-23T20:55:49Z","title":"Towards Trustable Language Models: Investigating Information Quality of\n Large Language Models","summary":" Large language models (LLM) are generating information at a rapid pace,\nrequiring users to increasingly rely and trust the data. Despite remarkable\nadvances of LLM, Information generated by LLM is not completely trustworthy,\ndue to challenges in information quality. Specifically, integrity of\nInformation quality decreases due to unreliable, biased, tokenization during\npre-training of LLM. 
Moreover, this decrease in information quality has\nled to hallucination and fabricated information. Unreliable information can\nlead to flawed decisions in businesses, which impacts economic activity.\nIn this work, we introduce a novel mathematical evaluation of the information\nquality of LLMs; we furthermore analyze and highlight information quality\nchallenges and scaling laws to systematically scale language models.\n","authors":["Rick Rejeleene","Xiaowei Xu","John Talburt"],"pdf_url":"https://arxiv.org/pdf/2401.13086v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2306.08877v3","updated":"2024-01-23T20:55:48Z","published":"2023-06-15T06:21:44Z","title":"Linguistic Binding in Diffusion Models: Enhancing Attribute\n Correspondence through Attention Map Alignment","summary":" Text-conditioned image generation models often generate incorrect\nassociations between entities and their visual attributes. This reflects an\nimpaired mapping between linguistic binding of entities and modifiers in the\nprompt and visual binding of the corresponding elements in the generated image.\nAs one notable example, a query like \"a pink sunflower and a yellow flamingo\"\nmay incorrectly produce an image of a yellow sunflower and a pink flamingo. To\nremedy this issue, we propose SynGen, an approach which first syntactically\nanalyses the prompt to identify entities and their modifiers, and then uses a\nnovel loss function that encourages the cross-attention maps to agree with the\nlinguistic binding reflected by the syntax. Specifically, we encourage large\noverlap between attention maps of entities and their modifiers, and small\noverlap with other entities and modifier words. The loss is optimized during\ninference, without retraining or fine-tuning the model. Human evaluation on\nthree datasets, including one new and challenging set, demonstrates significant\nimprovements of SynGen compared with current state of the art methods. This\nwork highlights how making use of sentence structure during inference can\nefficiently and substantially improve the faithfulness of text-to-image\ngeneration.\n","authors":["Royi Rassin","Eran Hirsch","Daniel Glickman","Shauli Ravfogel","Yoav Goldberg","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2306.08877v3.pdf","comment":"Accepted to NeurIPS 2023 (oral). Our code is publicly available at\n https://github.com/RoyiRa/Syntax-Guided-Generation"},{"id":"http://arxiv.org/abs/2401.13085v1","updated":"2024-01-23T20:54:40Z","published":"2024-01-23T20:54:40Z","title":"IndiText Boost: Text Augmentation for Low Resource India Languages","summary":" Text Augmentation is an important task for low-resource languages. It helps\ndeal with the problem of data scarcity. Through the years, much work has been\ndone on data augmentation for the English language. In contrast, far less work\nhas been done on Indian languages, even though data augmentation is precisely\nwhat is needed to address their data scarcity. In this work, we focus on\nimplementing techniques like Easy Data Augmentation, Back Translation,\nParaphrasing, Text Generation using LLMs, and Text Expansion using LLMs for\ntext classification on different languages. We focus on 6 Indian languages,\nnamely: Sindhi, Marathi, Hindi, Gujarati, Telugu, and Sanskrit. 
According to\nour knowledge, no such work exists for text augmentation on Indian languages.\nWe carry out binary as well as multi-class text classification to make our\nresults more comparable. We get surprising results as basic data augmentation\ntechniques surpass LLMs.\n","authors":["Onkar Litake","Niraj Yagnik","Shreyas Labhsetwar"],"pdf_url":"https://arxiv.org/pdf/2401.13085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15812v2","updated":"2024-01-23T20:44:17Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs. We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree 60% for both human and AI\nannotators. Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomena, such as human annotators would rate denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we also observe that the choice of feedback protocol also has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than reference response?) but not\nwith a rating-based evaluation protocol (score Rank X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v2.pdf","comment":"31 pages, Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10841v2","updated":"2024-01-23T20:05:30Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech in\n extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. 
It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v2.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2401.11120v2","updated":"2024-01-23T19:43:06Z","published":"2024-01-20T05:10:46Z","title":"Enhancing Large Language Models for Clinical Decision Support by\n Incorporating Clinical Practice Guidelines","summary":" Background Large Language Models (LLMs), enhanced with Clinical Practice\nGuidelines (CPGs), can significantly improve Clinical Decision Support (CDS).\nHowever, methods for incorporating CPGs into LLMs are not well studied. Methods\nWe develop three distinct methods for incorporating CPGs into LLMs: Binary\nDecision Tree (BDT), Program-Aided Graph Construction (PAGC), and\nChain-of-Thought-Few-Shot Prompting (CoT-FSP). To evaluate the effectiveness of\nthe proposed methods, we create a set of synthetic patient descriptions and\nconduct both automatic and human evaluation of the responses generated by four\nLLMs: GPT-4, GPT-3.5 Turbo, LLaMA, and PaLM 2. Zero-Shot Prompting (ZSP) was\nused as the baseline method. We focus on CDS for COVID-19 outpatient treatment\nas the case study. Results All four LLMs exhibit improved performance when\nenhanced with CPGs compared to the baseline ZSP. BDT outperformed both CoT-FSP\nand PAGC in automatic evaluation. All of the proposed methods demonstrated high\nperformance in human evaluation. Conclusion LLMs enhanced with CPGs demonstrate\nsuperior performance, as compared to plain LLMs with ZSP, in providing accurate\nrecommendations for COVID-19 outpatient treatment, which also highlights the\npotential for broader applications beyond the case study.\n","authors":["David Oniani","Xizhi Wu","Shyam Visweswaran","Sumit Kapoor","Shravan Kooragayalu","Katelyn Polanska","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11803v2","updated":"2024-01-23T19:37:20Z","published":"2023-12-19T02:35:13Z","title":"NLP for Maternal Healthcare: Perspectives and Guiding Principles in the\n Age of LLMs","summary":" Ethical frameworks for the use of natural language processing (NLP) are\nurgently needed to shape how large language models (LLMs) and similar tools are\nused for healthcare applications. Healthcare faces existing challenges\nincluding the balance of power in clinician-patient relationships, systemic\nhealth disparities, historical injustices, and economic constraints. Drawing\ndirectly from the voices of those most affected, and focusing on a case study\nof a specific healthcare setting, we propose a set of guiding principles for\nthe use of NLP in maternal healthcare. 
We led an interactive session centered\non an LLM-based chatbot demonstration during a full-day workshop with 39\nparticipants, and additionally surveyed 30 healthcare workers and 30 birthing\npeople about their values, needs, and perceptions of NLP tools in the context\nof maternal health. We conducted quantitative and qualitative analyses of the\nsurvey results and interactive discussions to consolidate our findings into a\nset of guiding principles. We propose nine principles for ethical use of NLP\nfor maternal healthcare, grouped into three themes: (i) recognizing contextual\nsignificance (ii) holistic measurements, and (iii) who/what is valued. For each\nprinciple, we describe its underlying rationale and provide practical advice.\nThis set of principles can provide a methodological pattern for other\nresearchers and serve as a resource to practitioners working on maternal health\nand other healthcare fields to emphasize the importance of technical nuance,\nhistorical context, and inclusive design when developing NLP technologies for\nclinical use.\n","authors":["Maria Antoniak","Aakanksha Naik","Carla S. Alvarado","Lucy Lu Wang","Irene Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2312.11803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13060v1","updated":"2024-01-23T19:32:54Z","published":"2024-01-23T19:32:54Z","title":"TCE at Qur'an QA 2023 Shared Task: Low Resource Enhanced\n Transformer-based Ensemble Approach for Qur'anic QA","summary":" In this paper, we present our approach to tackle Qur'an QA 2023 shared tasks\nA and B. To address the challenge of low-resourced training data, we rely on\ntransfer learning together with a voting ensemble to improve prediction\nstability across multiple runs. Additionally, we employ different architectures\nand learning mechanisms for a range of Arabic pre-trained transformer-based\nmodels for both tasks. To identify unanswerable questions, we propose using a\nthresholding mechanism. Our top-performing systems greatly surpass the baseline\nperformance on the hidden split, achieving a MAP score of 25.05% for task A and\na partial Average Precision (pAP) of 57.11% for task B.\n","authors":["Mohammed Alaa Elkomy","Amany Sarhan"],"pdf_url":"https://arxiv.org/pdf/2401.13060v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.12978v1","updated":"2024-01-23T18:59:59Z","published":"2024-01-23T18:59:59Z","title":"Zero-Shot Learning for the Primitives of 3D Affordance in General\n Objects","summary":" One of the major challenges in AI is teaching machines to precisely respond\nand utilize environmental functionalities, thereby achieving the affordance\nawareness that humans possess. Despite its importance, the field has been\nlagging in terms of learning, especially in 3D, as annotating affordance\naccompanies a laborious process due to the numerous variations of human-object\ninteraction. The low availability of affordance data limits the learning in\nterms of generalization for object categories, and also simplifies the\nrepresentation of affordance, capturing only a fraction of the affordance. To\novercome these challenges, we propose a novel, self-supervised method to\ngenerate the 3D affordance examples given only a 3D object, without any manual\nannotations. 
The method starts by capturing the 3D object into images and\ncreating 2D affordance images by inserting humans into the image via inpainting\ndiffusion models, where we present the Adaptive Mask algorithm to enable human\ninsertion without altering the original details of the object. The method\nconsequently lifts inserted humans back to 3D to create 3D human-object pairs,\nwhere the depth ambiguity is resolved within a depth optimization framework\nthat utilizes pre-generated human postures from multiple viewpoints. We also\nprovide a novel affordance representation defined on relative orientations and\nproximity between dense human and object points, that can be easily aggregated\nfrom any 3D HOI datasets. The proposed representation serves as a primitive\nthat can be manifested to conventional affordance representations via simple\ntransformations, ranging from physically exerted affordances to nonphysical\nones. We demonstrate the efficacy of our method and representation by\ngenerating the 3D affordance samples and deriving high-quality affordance\nexamples from the representation, including contact, orientation, and spatial\noccupancies.\n","authors":["Hyeonwoo Kim","Sookwan Han","Patrick Kwon","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2401.12978v1.pdf","comment":"Project Page: https://sshowbiz.github.io/ZSP3A/"},{"id":"http://arxiv.org/abs/2401.12979v1","updated":"2024-01-23T18:59:59Z","published":"2024-01-23T18:59:59Z","title":"GALA: Generating Animatable Layered Assets from a Single Scan","summary":" We present GALA, a framework that takes as input a single-layer clothed 3D\nhuman mesh and decomposes it into complete multi-layered 3D assets. The outputs\ncan then be combined with other assets to create novel clothed human avatars\nwith any pose. Existing reconstruction approaches often treat clothed humans as\na single-layer of geometry and overlook the inherent compositionality of humans\nwith hairstyles, clothing, and accessories, thereby limiting the utility of the\nmeshes for downstream applications. Decomposing a single-layer mesh into\nseparate layers is a challenging task because it requires the synthesis of\nplausible geometry and texture for the severely occluded regions. Moreover,\neven with successful decomposition, meshes are not normalized in terms of poses\nand body shapes, failing coherent composition with novel identities and poses.\nTo address these challenges, we propose to leverage the general knowledge of a\npretrained 2D diffusion model as geometry and appearance prior for humans and\nother assets. We first separate the input mesh using the 3D surface\nsegmentation extracted from multi-view 2D segmentations. Then we synthesize the\nmissing geometry of different layers in both posed and canonical spaces using a\nnovel pose-guided Score Distillation Sampling (SDS) loss. Once we complete\ninpainting high-fidelity 3D geometry, we also apply the same SDS loss to its\ntexture to obtain the complete appearance including the initially occluded\nregions. Through a series of decomposition steps, we obtain multiple layers of\n3D assets in a shared canonical space normalized in terms of poses and human\nshapes, hence supporting effortless composition to novel identities and\nreanimation with novel poses. 
Our experiments demonstrate the effectiveness of\nour approach for decomposition, canonicalization, and composition tasks\ncompared to existing solutions.\n","authors":["Taeksoo Kim","Byungjun Kim","Shunsuke Saito","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2401.12979v1.pdf","comment":"The project page is available at https://snuvclab.github.io/gala/"},{"id":"http://arxiv.org/abs/2401.12977v1","updated":"2024-01-23T18:59:56Z","published":"2024-01-23T18:59:56Z","title":"IRIS: Inverse Rendering of Indoor Scenes from Low Dynamic Range Images","summary":" While numerous 3D reconstruction and novel-view synthesis methods allow for\nphotorealistic rendering of a scene from multi-view images easily captured with\nconsumer cameras, they bake illumination in their representations and fall\nshort of supporting advanced applications like material editing, relighting,\nand virtual object insertion. The reconstruction of physically based material\nproperties and lighting via inverse rendering promises to enable such\napplications.\n However, most inverse rendering techniques require high dynamic range (HDR)\nimages as input, a setting that is inaccessible to most users. We present a\nmethod that recovers the physically based material properties and\nspatially-varying HDR lighting of a scene from multi-view, low-dynamic-range\n(LDR) images. We model the LDR image formation process in our inverse rendering\npipeline and propose a novel optimization strategy for material, lighting, and\na camera response model. We evaluate our approach with synthetic and real\nscenes compared to the state-of-the-art inverse rendering methods that take\neither LDR or HDR input. Our method outperforms existing methods taking LDR\nimages as input, and allows for highly realistic relighting and object\ninsertion.\n","authors":["Zhi-Hao Lin","Jia-Bin Huang","Zhengqin Li","Zhao Dong","Christian Richardt","Tuotuo Li","Michael Zollhöfer","Johannes Kopf","Shenlong Wang","Changil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12977v1.pdf","comment":"Project Website: https://irisldr.github.io/"},{"id":"http://arxiv.org/abs/2401.04079v2","updated":"2024-01-23T18:59:52Z","published":"2024-01-08T18:31:38Z","title":"RudolfV: A Foundation Model by Pathologists for Pathologists","summary":" Histopathology plays a central role in clinical medicine and biomedical\nresearch. While artificial intelligence shows promising results on many\npathological tasks, generalization and dealing with rare diseases, where\ntraining data is scarce, remains a challenge. Distilling knowledge from\nunlabeled data into a foundation model before learning from, potentially\nlimited, labeled data provides a viable path to address these challenges. In\nthis work, we extend the state of the art of foundation models for digital\npathology whole slide images by semi-automated data curation and incorporating\npathologist domain knowledge. Specifically, we combine computational and\npathologist domain knowledge (1) to curate a diverse dataset of 103k slides\ncorresponding to 750 million image patches covering data from different\nfixation, staining, and scanning protocols as well as data from different\nindications and labs across the EU and US, (2) for grouping semantically\nsimilar slides and tissue patches, and (3) to augment the input images during\ntraining. 
We evaluate the resulting model on a set of public and internal\nbenchmarks and show that although our foundation model is trained with an order\nof magnitude less slides, it performs on par or better than competing models.\nWe expect that scaling our approach to more data and larger models will further\nincrease its performance and capacity to deal with increasingly complex real\nworld tasks in diagnostics and biomedical research.\n","authors":["Jonas Dippel","Barbara Feulner","Tobias Winterhoff","Simon Schallenberg","Gabriel Dernbach","Andreas Kunft","Stephan Tietz","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Maximilian Alber"],"pdf_url":"https://arxiv.org/pdf/2401.04079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12975v1","updated":"2024-01-23T18:59:43Z","published":"2024-01-23T18:59:43Z","title":"HAZARD Challenge: Embodied Decision Making in Dynamically Changing\n Environments","summary":" Recent advances in high-fidelity virtual environments serve as one of the\nmajor driving forces for building intelligent embodied agents to perceive,\nreason and interact with the physical world. Typically, these environments\nremain unchanged unless agents interact with them. However, in real-world\nscenarios, agents might also face dynamically changing environments\ncharacterized by unexpected events and need to rapidly take action accordingly.\nTo remedy this gap, we propose a new simulated embodied benchmark, called\nHAZARD, specifically designed to assess the decision-making abilities of\nembodied agents in dynamic situations. HAZARD consists of three unexpected\ndisaster scenarios, including fire, flood, and wind, and specifically supports\nthe utilization of large language models (LLMs) to assist common sense\nreasoning and decision-making. This benchmark enables us to evaluate autonomous\nagents' decision-making capabilities across various pipelines, including\nreinforcement learning (RL), rule-based, and search-based methods in\ndynamically changing environments. As a first step toward addressing this\nchallenge using large language models, we further develop an LLM-based agent\nand perform an in-depth analysis of its promise and challenge of solving these\nchallenging tasks. HAZARD is available at https://vis-www.cs.umass.edu/hazard/.\n","authors":["Qinhong Zhou","Sunli Chen","Yisong Wang","Haozhe Xu","Weihua Du","Hongxin Zhang","Yilun Du","Joshua B. Tenenbaum","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2401.12975v1.pdf","comment":"ICLR 2024. The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2312.12433v2","updated":"2024-01-23T18:59:39Z","published":"2023-12-19T18:58:40Z","title":"Tracking Any Object Amodally","summary":" Amodal perception, the ability to comprehend complete object structures from\npartial visibility, is a fundamental skill, even for infants. Its significance\nextends to applications like autonomous driving, where a clear understanding of\nheavily occluded objects is essential. However, modern detection and tracking\nalgorithms often overlook this critical capability, perhaps due to the\nprevalence of modal annotations in most datasets. To address the scarcity of\namodal data, we introduce the TAO-Amodal benchmark, featuring 880 diverse\ncategories in thousands of video sequences. Our dataset includes amodal and\nmodal bounding boxes for visible and occluded objects, including objects that\nare partially out-of-frame. 
To enhance amodal tracking with object permanence,\nwe leverage a lightweight plug-in module, the amodal expander, to transform\nstandard, modal trackers into amodal ones through fine-tuning on a few hundred\nvideo sequences with data augmentation. We achieve a 3.3\\% and 1.6\\%\nimprovement on the detection and tracking of occluded objects on TAO-Amodal.\nWhen evaluated on people, our method produces dramatic improvements of 2x\ncompared to state-of-the-art modal baselines.\n","authors":["Cheng-Yen Hsieh","Tarasha Khurana","Achal Dave","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.12433v2.pdf","comment":"Project Page: https://tao-amodal.github.io"},{"id":"http://arxiv.org/abs/2401.12974v1","updated":"2024-01-23T18:59:25Z","published":"2024-01-23T18:59:25Z","title":"SegmentAnyBone: A Universal Model that Segments Any Bone at Any Location\n on MRI","summary":" Magnetic Resonance Imaging (MRI) is pivotal in radiology, offering\nnon-invasive and high-quality insights into the human body. Precise\nsegmentation of MRIs into different organs and tissues would be highly\nbeneficial since it would allow for a higher level of understanding of the\nimage content and enable important measurements, which are essential for\naccurate diagnosis and effective treatment planning. Specifically, segmenting\nbones in MRI would allow for more quantitative assessments of musculoskeletal\nconditions, while such assessments are largely absent in current radiological\npractice. The difficulty of bone MRI segmentation is illustrated by the fact\nthat limited algorithms are publicly available for use, and those contained in\nthe literature typically address a specific anatomic area. In our study, we\npropose a versatile, publicly available deep-learning model for bone\nsegmentation in MRI across multiple standard MRI locations. The proposed model\ncan operate in two modes: fully automated segmentation and prompt-based\nsegmentation. Our contributions include (1) collecting and annotating a new MRI\ndataset across various MRI protocols, encompassing over 300 annotated volumes\nand 8485 annotated slices across diverse anatomic regions; (2) investigating\nseveral standard network architectures and strategies for automated\nsegmentation; (3) introducing SegmentAnyBone, an innovative foundational\nmodel-based approach that extends Segment Anything Model (SAM); (4) comparative\nanalysis of our algorithm and previous approaches; and (5) generalization\nanalysis of our algorithm across different anatomical locations and MRI\nsequences, as well as an external dataset. We publicly release our model at\nhttps://github.com/mazurowski-lab/SegmentAnyBone.\n","authors":["Hanxue Gu","Roy Colglazier","Haoyu Dong","Jikai Zhang","Yaqian Chen","Zafer Yildiz","Yuwen Chen","Lin Li","Jichen Yang","Jay Willhite","Alex M. Meyer","Brian Guo","Yashvi Atul Shah","Emily Luo","Shipra Rajput","Sally Kuehn","Clark Bulleit","Kevin A. Wu","Jisoo Lee","Brandon Ramirez","Darui Lu","Jay M. Levin","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.12974v1.pdf","comment":"15 pages, 15 figures"},{"id":"http://arxiv.org/abs/2401.12972v1","updated":"2024-01-23T18:58:35Z","published":"2024-01-23T18:58:35Z","title":"On the Efficacy of Text-Based Input Modalities for Action Anticipation","summary":" Although the task of anticipating future actions is highly uncertain,\ninformation from additional modalities help to narrow down plausible action\nchoices. Each modality provides different environmental context for the model\nto learn from. 
While previous multi-modal methods leverage information from\nmodalities such as video and audio, we primarily explore how text inputs for\nactions and objects can also enable more accurate action anticipation.\nTherefore, we propose a Multi-modal Anticipative Transformer (MAT), an\nattention-based video transformer architecture that jointly learns from\nmulti-modal features and text captions. We train our model in two stages, where\nthe model first learns to predict actions in the video clip by aligning with\ncaptions, and during the second stage, we fine-tune the model to predict future\nactions. Compared to existing methods, MAT has the advantage of learning\nadditional environmental context from two kinds of text inputs: action\ndescriptions during the pre-training stage, and the text inputs for detected\nobjects and actions during modality feature fusion. Through extensive\nexperiments, we evaluate the effectiveness of the pre-training stage, and show\nthat our model outperforms previous methods on all datasets. In addition, we\nexamine the impact of object and action information obtained via text and\nperform extensive ablations. We evaluate the performance on three datasets:\nEpicKitchens-100, EpicKitchens-55 and EGTEA GAZE+; and show that text\ndescriptions do indeed aid in more effective action anticipation.\n","authors":["Apoorva Beedu","Karan Samel","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2401.12972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12963v1","updated":"2024-01-23T18:45:54Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks. However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. 
We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v1.pdf","comment":"26 pages, 9 figures"},{"id":"http://arxiv.org/abs/2303.07700v3","updated":"2024-01-23T18:37:41Z","published":"2023-03-14T08:28:36Z","title":"PATS: Patch Area Transportation with Subdivision for Local Feature\n Matching","summary":" Local feature matching aims at establishing sparse correspondences between a\npair of images. Recently, detector-free methods present generally better\nperformance but are not satisfactory in image pairs with large scale\ndifferences. In this paper, we propose Patch Area Transportation with\nSubdivision (PATS) to tackle this issue. Instead of building an expensive image\npyramid, we start by splitting the original image pair into equal-sized patches\nand gradually resizing and subdividing them into smaller patches with the same\nscale. However, estimating scale differences between these patches is\nnon-trivial since the scale differences are determined by both relative camera\nposes and scene structures, and thus spatially varying over image pairs.\nMoreover, it is hard to obtain the ground truth for real scenes. To this end,\nwe propose patch area transportation, which enables learning scale differences\nin a self-supervised manner. In contrast to bipartite graph matching, which\nonly handles one-to-one matching, our patch area transportation can deal with\nmany-to-many relationships. PATS improves both matching accuracy and coverage,\nand shows superior performance in downstream tasks, such as relative pose\nestimation, visual localization, and optical flow estimation. The source code\nis available at \\url{https://zju3dv.github.io/pats/}.\n","authors":["Junjie Ni","Yijin Li","Zhaoyang Huang","Hongsheng Li","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.07700v3.pdf","comment":"Accepted to CVPR 2023. Project page: https://zju3dv.github.io/pats"},{"id":"http://arxiv.org/abs/2401.12946v1","updated":"2024-01-23T18:07:07Z","published":"2024-01-23T18:07:07Z","title":"Coverage Axis++: Efficient Inner Point Selection for 3D Shape\n Skeletonization","summary":" We introduce Coverage Axis++, a novel and efficient approach to 3D shape\nskeletonization. The current state-of-the-art approaches for this task often\nrely on the watertightness of the input or suffer from substantial\ncomputational costs, thereby limiting their practicality. To address this\nchallenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal\npoints, offering a high-accuracy approximation of the Medial Axis Transform\n(MAT) while significantly mitigating computational intensity for various shape\nrepresentations. We introduce a simple yet effective strategy that considers\nboth shape coverage and uniformity to derive skeletal points. 
The selection\nprocedure enforces consistency with the shape structure while favoring the\ndominant medial balls, which thus introduces a compact underlying shape\nrepresentation in terms of MAT. As a result, Coverage Axis++ allows for\nskeletonization for various shape representations (e.g., water-tight meshes,\ntriangle soups, point clouds), specification of the number of skeletal points,\nfew hyperparameters, and highly efficient computation with improved\nreconstruction accuracy. Extensive experiments across a wide range of 3D shapes\nvalidate the efficiency and effectiveness of Coverage Axis++. The code will be\npublicly available once the paper is published.\n","authors":["Zimeng Wang","Zhiyang Dou","Rui Xu","Cheng Lin","Yuan Liu","Xiaoxiao Long","Shiqing Xin","Lingjie Liu","Taku Komura","Xiaoming Yuan","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12945v1","updated":"2024-01-23T18:05:25Z","published":"2024-01-23T18:05:25Z","title":"Lumiere: A Space-Time Diffusion Model for Video Generation","summary":" We introduce Lumiere -- a text-to-video diffusion model designed for\nsynthesizing videos that portray realistic, diverse and coherent motion -- a\npivotal challenge in video synthesis. To this end, we introduce a Space-Time\nU-Net architecture that generates the entire temporal duration of the video at\nonce, through a single pass in the model. This is in contrast to existing video\nmodels which synthesize distant keyframes followed by temporal super-resolution\n-- an approach that inherently makes global temporal consistency difficult to\nachieve. By deploying both spatial and (importantly) temporal down- and\nup-sampling and leveraging a pre-trained text-to-image diffusion model, our\nmodel learns to directly generate a full-frame-rate, low-resolution video by\nprocessing it in multiple space-time scales. We demonstrate state-of-the-art\ntext-to-video generation results, and show that our design easily facilitates a\nwide range of content creation tasks and video editing applications, including\nimage-to-video, video inpainting, and stylized generation.\n","authors":["Omer Bar-Tal","Hila Chefer","Omer Tov","Charles Herrmann","Roni Paiss","Shiran Zada","Ariel Ephrat","Junhwa Hur","Yuanzhen Li","Tomer Michaeli","Oliver Wang","Deqing Sun","Tali Dekel","Inbar Mosseri"],"pdf_url":"https://arxiv.org/pdf/2401.12945v1.pdf","comment":"Webpage: https://lumiere-video.github.io/ | Video:\n https://www.youtube.com/watch?v=wxLr02Dz2Sc"},{"id":"http://arxiv.org/abs/2401.11114v2","updated":"2024-01-23T18:00:13Z","published":"2024-01-20T04:55:29Z","title":"DengueNet: Dengue Prediction using Spatiotemporal Satellite Imagery for\n Resource-Limited Countries","summary":" Dengue fever presents a substantial challenge in developing countries where\nsanitation infrastructure is inadequate. The absence of comprehensive\nhealthcare systems exacerbates the severity of dengue infections, potentially\nleading to life-threatening circumstances. Rapid response to dengue outbreaks\nis also challenging due to limited information exchange and integration. While\ntimely dengue outbreak forecasts have the potential to prevent such outbreaks,\nthe majority of dengue prediction studies have predominantly relied on data\nthat impose significant burdens on individual countries for collection. 
In this\nstudy, our aim is to improve health equity in resource-constrained countries by\nexploring the effectiveness of high-resolution satellite imagery as a\nnontraditional and readily accessible data source. By leveraging the wealth of\npublicly available and easily obtainable satellite imagery, we present a\nscalable satellite extraction framework based on Sentinel Hub, a cloud-based\ncomputing platform. Furthermore, we introduce DengueNet, an innovative\narchitecture that combines Vision Transformer, Radiomics, and Long Short-term\nMemory to extract and integrate spatiotemporal features from satellite images.\nThis enables dengue predictions on an epi-week basis. To evaluate the\neffectiveness of our proposed method, we conducted experiments on five\nmunicipalities in Colombia. We utilized a dataset comprising 780\nhigh-resolution Sentinel-2 satellite images for training and evaluation. The\nperformance of DengueNet was assessed using the mean absolute error (MAE)\nmetric. Across the five municipalities, DengueNet achieved an average MAE of\n43.92. Our findings strongly support the efficacy of satellite imagery as a\nvaluable resource for dengue prediction, particularly in informing public\nhealth policies within countries where manually collected data is scarce and\ndengue virus prevalence is severe.\n","authors":["Kuan-Ting Kuo","Dana Moukheiber","Sebastian Cajas Ordonez","David Restrepo","Atika Rahman Paddo","Tsung-Yu Chen","Lama Moukheiber","Mira Moukheiber","Sulaiman Moukheiber","Saptarshi Purkayastha","Po-Chih Kuo","Leo Anthony Celi"],"pdf_url":"https://arxiv.org/pdf/2401.11114v2.pdf","comment":"Published at the IJCAI 2023 Workshop on Bridge-AI: from Climate\n Change to Health Equity (BridgeAICCHE)., Macao, S.A.R"},{"id":"http://arxiv.org/abs/2401.12938v1","updated":"2024-01-23T17:50:58Z","published":"2024-01-23T17:50:58Z","title":"Neural deformation fields for template-based reconstruction of cortical\n surfaces from MRI","summary":" The reconstruction of cortical surfaces is a prerequisite for quantitative\nanalyses of the cerebral cortex in magnetic resonance imaging (MRI). Existing\nsegmentation-based methods separate the surface registration from the surface\nextraction, which is computationally inefficient and prone to distortions. We\nintroduce Vox2Cortex-Flow (V2C-Flow), a deep mesh-deformation technique that\nlearns a deformation field from a brain template to the cortical surfaces of an\nMRI scan. To this end, we present a geometric neural network that models the\ndeformation-describing ordinary differential equation in a continuous manner.\nThe network architecture comprises convolutional and graph-convolutional\nlayers, which allows it to work with images and meshes at the same time.\nV2C-Flow is not only very fast, requiring less than two seconds to infer all\nfour cortical surfaces, but also establishes vertex-wise correspondences to the\ntemplate during reconstruction. In addition, V2C-Flow is the first approach for\ncortex reconstruction that models white matter and pial surfaces jointly,\ntherefore avoiding intersections between them. Our comprehensive experiments on\ninternal and external test data demonstrate that V2C-Flow results in cortical\nsurfaces that are state-of-the-art in terms of accuracy. 
Moreover, we show that\nthe established correspondences are more consistent than in FreeSurfer and that\nthey can directly be utilized for cortex parcellation and group analyses of\ncortical thickness.\n","authors":["Fabian Bongratz","Anne-Marie Rickmann","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2401.12938v1.pdf","comment":"To appear in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2401.12932v1","updated":"2024-01-23T17:37:34Z","published":"2024-01-23T17:37:34Z","title":"Segmentation of tibiofemoral joint tissues from knee MRI using MtRA-Unet\n and incorporating shape information: Data from the Osteoarthritis Initiative","summary":" Knee Osteoarthritis (KOA) is the third most prevalent Musculoskeletal\nDisorder (MSD) after neck and back pain. To monitor such a severe MSD, a\nsegmentation map of the femur, tibia and tibiofemoral cartilage is usually\naccessed using the automated segmentation algorithm from the Magnetic Resonance\nImaging (MRI) of the knee. But, in recent works, such segmentation is\nconceivable only from the multistage framework thus creating data handling\nissues and needing continuous manual inference rendering it unable to make a\nquick and precise clinical diagnosis. In order to solve these issues, in this\npaper the Multi-Resolution Attentive-Unet (MtRA-Unet) is proposed to segment\nthe femur, tibia and tibiofemoral cartilage automatically. The proposed work\nhas included a novel Multi-Resolution Feature Fusion (MRFF) and Shape\nReconstruction (SR) loss that focuses on multi-contextual information and\nstructural anatomical details of the femur, tibia and tibiofemoral cartilage.\nUnlike previous approaches, the proposed work is a single-stage and end-to-end\nframework producing a Dice Similarity Coefficient (DSC) of 98.5% for the femur,\n98.4% for the tibia, 89.1% for Femoral Cartilage (FC) and 86.1% for Tibial\nCartilage (TC) for critical MRI slices that can be helpful to clinicians for\nKOA grading. The time to segment MRI volume (160 slices) per subject is 22 sec.\nwhich is one of the fastest among state-of-the-art. Moreover, comprehensive\nexperimentation on the segmentation of FC and TC which is of utmost importance\nfor morphology-based studies to check KOA progression reveals that the proposed\nmethod has produced an excellent result with binary segmentation\n","authors":["Akshay Daydar","Alik Pramanick","Arijit Sur","Subramani Kanagaraj"],"pdf_url":"https://arxiv.org/pdf/2401.12932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12915v1","updated":"2024-01-23T17:07:18Z","published":"2024-01-23T17:07:18Z","title":"Red Teaming Visual Language Models","summary":" VLMs (Vision-Language Models) extend the capabilities of LLMs (Large Language\nModels) to accept multimodal inputs. Since it has been verified that LLMs can\nbe induced to generate harmful or inaccurate content through specific test\ncases (termed as Red Teaming), how VLMs perform in similar scenarios,\nespecially with their combination of textual and visual inputs, remains a\nquestion. To explore this problem, we present a novel red teaming dataset\nRTVLM, which encompasses 10 subtasks (e.g., image misleading, multi-modal\njail-breaking, face fairness, etc) under 4 primary aspects (faithfulness,\nprivacy, safety, fairness). Our RTVLM is the first red-teaming dataset to\nbenchmark current VLMs in terms of these 4 different aspects. 
Detailed analysis\nshows that 10 prominent open-sourced VLMs struggle with the red teaming in\ndifferent degrees and have up to 31% performance gap with GPT-4V. Additionally,\nwe simply apply red teaming alignment to LLaVA-v1.5 with Supervised Fine-tuning\n(SFT) using RTVLM, and this bolsters the models' performance with 10% in RTVLM\ntest set, 13% in MM-Hal, and without noticeable decline in MM-Bench,\noverpassing other LLaVA-based models with regular alignment data. This reveals\nthat current open-sourced VLMs still lack red teaming alignment. Our code and\ndatasets will be open-source.\n","authors":["Mukai Li","Lei Li","Yuwei Yin","Masood Ahmed","Zhenguang Liu","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12915v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2312.02218v2","updated":"2024-01-23T16:53:48Z","published":"2023-12-03T15:19:08Z","title":"WavePlanes: A compact Wavelet representation for Dynamic Neural Radiance\n Fields","summary":" Dynamic Neural Radiance Fields (Dynamic NeRF) enhance NeRF technology to\nmodel moving scenes. However, they are resource intensive and challenging to\ncompress. To address this issue, this paper presents WavePlanes, a fast and\nmore compact explicit model. We propose a multi-scale space and space-time\nfeature plane representation using N-level 2-D wavelet coefficients. The\ninverse discrete wavelet transform reconstructs N feature signals at varying\ndetail, which are linearly decoded to approximate the color and density of\nvolumes in a 4-D grid. Exploiting the sparsity of wavelet coefficients, we\ncompress a Hash Map containing only non-zero coefficients and their locations\non each plane. This results in a compressed model size of ~12 MB. Compared with\nstate-of-the-art plane-based models, WavePlanes is up to 15x smaller, less\ncomputationally demanding and achieves comparable results in as little as one\nhour of training - without requiring custom CUDA code or high performance\ncomputing resources. Additionally, we propose new feature fusion schemes that\nwork as well as previously proposed schemes while providing greater\ninterpretability. Our code is available at:\nhttps://github.com/azzarelli/waveplanes/\n","authors":["Adrian Azzarelli","Nantheera Anantrasirichai","David R Bull"],"pdf_url":"https://arxiv.org/pdf/2312.02218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12902v1","updated":"2024-01-23T16:48:18Z","published":"2024-01-23T16:48:18Z","title":"Facing the Elephant in the Room: Visual Prompt Tuning or Full\n Finetuning?","summary":" As the scale of vision models continues to grow, the emergence of Visual\nPrompt Tuning (VPT) as a parameter-efficient transfer learning technique has\ngained attention due to its superior performance compared to traditional\nfull-finetuning. However, the conditions favoring VPT (the ``when\") and the\nunderlying rationale (the ``why\") remain unclear. In this paper, we conduct a\ncomprehensive analysis across 19 distinct datasets and tasks. To understand the\n``when\" aspect, we identify the scenarios where VPT proves favorable by two\ndimensions: task objectives and data distributions. We find that VPT is\npreferrable when there is 1) a substantial disparity between the original and\nthe downstream task objectives (e.g., transitioning from classification to\ncounting), or 2) a similarity in data distributions between the two tasks\n(e.g., both involve natural images). 
In exploring the ``why\" dimension, our\nresults indicate VPT's success cannot be attributed solely to overfitting and\noptimization considerations. The unique way VPT preserves original features and\nadds parameters appears to be a pivotal factor. Our study provides insights\ninto VPT's mechanisms, and offers guidance for its optimal utilization.\n","authors":["Cheng Han","Qifan Wang","Yiming Cui","Wenguan Wang","Lifu Huang","Siyuan Qi","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12902v1.pdf","comment":"29 pages, 19 figures"},{"id":"http://arxiv.org/abs/2401.12900v1","updated":"2024-01-23T16:40:47Z","published":"2024-01-23T16:40:47Z","title":"PSAvatar: A Point-based Morphable Shape Model for Real-Time Head Avatar\n Creation with 3D Gaussian Splatting","summary":" Despite much progress, creating real-time high-fidelity head avatar is still\ndifficult and existing methods have to trade-off between speed and quality.\n3DMM based methods often fail to model non-facial structures such as eyeglasses\nand hairstyles, while neural implicit models suffer from deformation\ninflexibility and rendering inefficiency.\n Although 3D Gaussian has been demonstrated to possess promising capability\nfor geometry representation and radiance field reconstruction, applying 3D\nGaussian in head avatar creation remains a major challenge since it is\ndifficult for 3D Gaussian to model the head shape variations caused by changing\nposes and expressions. In this paper, we introduce PSAvatar, a novel framework\nfor animatable head avatar creation that utilizes discrete geometric primitive\nto create a parametric morphable shape model and employs 3D Gaussian for fine\ndetail representation and high fidelity rendering. The parametric morphable\nshape model is a Point-based Morphable Shape Model (PMSM) which uses points\ninstead of meshes for 3D representation to achieve enhanced representation\nflexibility. The PMSM first converts the FLAME mesh to points by sampling on\nthe surfaces as well as off the meshes to enable the reconstruction of not only\nsurface-like structures but also complex geometries such as eyeglasses and\nhairstyles. By aligning these points with the head shape in an\nanalysis-by-synthesis manner, the PMSM makes it possible to utilize 3D Gaussian\nfor fine detail representation and appearance modeling, thus enabling the\ncreation of high-fidelity avatars. We show that PSAvatar can reconstruct\nhigh-fidelity head avatars of a variety of subjects and the avatars can be\nanimated in real-time ($\\ge$ 25 fps at a resolution of 512 x 512 )\n","authors":["Zhongyuan Zhao","Zhenyu Bao","Qing Li","Guoping Qiu","Kanglin Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12900v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2302.05154v2","updated":"2024-01-23T16:31:56Z","published":"2023-02-10T10:25:12Z","title":"Industrial and Medical Anomaly Detection Through Cycle-Consistent\n Adversarial Networks","summary":" In this study, a new Anomaly Detection (AD) approach for industrial and\nmedical images is proposed. This method leverages the theoretical strengths of\nunsupervised learning and the data availability of both normal and abnormal\nclasses. Indeed, the AD is often formulated as an unsupervised task, implying\nonly normal images during training. These normal images are devoted to be\nreconstructed, through an autoencoder architecture for instance. However, the\ninformation contained in abnormal data, when available, is also valuable for\nthis reconstruction. 
The model would be able to identify its weaknesses by\nbetter learning how to transform an abnormal (respectively normal) image into a\nnormal (respectively abnormal) one, helping the entire model to learn better\nthan a single normal to normal reconstruction. To address this challenge, the\nproposed method uses Cycle-Generative Adversarial Networks (Cycle-GAN) for\n(ab)normal-to-normal translation. After an input image has been reconstructed\nby the normal generator, an anomaly score quantifies the differences between\nthe input and its reconstruction. Based on a threshold set to satisfy a\nbusiness quality constraint, the input image is then flagged as normal or not.\nThe proposed method is evaluated on industrial and medical datasets. The\nresults demonstrate accurate performance with a zero false negative constraint\ncompared to state-of-the-art methods. The code is available at\nhttps://github.com/ValDelch/CycleGANS-AnomalyDetection.\n","authors":["Arnaud Bougaham","Valentin Delchevalerie","Mohammed El Adoui","Benoît Frénay"],"pdf_url":"https://arxiv.org/pdf/2302.05154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12888v1","updated":"2024-01-23T16:28:30Z","published":"2024-01-23T16:28:30Z","title":"Data-Centric Evolution in Autonomous Driving: A Comprehensive Survey of\n Big Data System, Data Mining, and Closed-Loop Technologies","summary":" The aspiration of the next generation's autonomous driving (AD) technology\nrelies on the dedicated integration and interaction among intelligent\nperception, prediction, planning, and low-level control. There has been a huge\nbottleneck regarding the upper bound of autonomous driving algorithm\nperformance, a consensus from academia and industry believes that the key to\nsurmount the bottleneck lies in data-centric autonomous driving technology.\nRecent advancement in AD simulation, closed-loop model training, and AD big\ndata engine have gained some valuable experience. However, there is a lack of\nsystematic knowledge and deep understanding regarding how to build efficient\ndata-centric AD technology for AD algorithm self-evolution and better AD big\ndata accumulation. To fill in the identified research gaps, this article will\nclosely focus on reviewing the state-of-the-art data-driven autonomous driving\ntechnologies, with an emphasis on the comprehensive taxonomy of autonomous\ndriving datasets characterized by milestone generations, key features, data\nacquisition settings, etc. Furthermore, we provide a systematic review of the\nexisting benchmark closed-loop AD big data pipelines from the industrial\nfrontier, including the procedure of closed-loop frameworks, key technologies,\nand empirical studies. Finally, the future directions, potential applications,\nlimitations and concerns are discussed to arouse efforts from both academia and\nindustry for promoting the further development of autonomous driving.\n","authors":["Lincan Li","Wei Shao","Wei Dong","Yijun Tian","Kaixiang Yang","Wenjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.12888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12870v1","updated":"2024-01-23T16:04:19Z","published":"2024-01-23T16:04:19Z","title":"Unlocking the Potential: Multi-task Deep Learning for Spaceborne\n Quantitative Monitoring of Fugitive Methane Plumes","summary":" With the intensification of global warming, the monitoring of methane\nemission and detection of gas plumes from landfills have increasingly received\nattention. 
We decompose methane emission monitoring into three sub-tasks:\nmethane concentration inversion, plume segmentation, and emission rate\nestimation. Conventional algorithms have limitations: methane concentration\ninversion usually uses the matched filter, which is sensitive to global\nspectrum distribution and contains a large amount of noises. There is limited\nresearch on plume segmentation, with many studies resorting to manual\nsegmentation that is likely to be subjective. The estimation of methane\nemission rate often utilizes IME algorithm, which relies on obtaining\nmeteorological measurement data. Using the WENT landfill site in Hong Kong and\nPRISMA hyperspectral satellite imagery, we propose a new deep learning-based\nframework for quantitative monitoring of methane emissions from remote sensing\nimages based on physical simulation. We generate simulated methane plumes using\nlarge eddy simulation (LES) and different concentration maps of fugitive\nemission using the radiative transfer equation (RTE), while combining\naugmentation techniques to create a simulated PRISMA dataset. We train a U-Net\nnetwork for methane concentration inversion, a Mask R-CNN network for methane\nplume segmentation, and a ResNet-50 network for methane emission rate\nestimation. All three deep networks achieve higher validation accuracy compared\nto conventional algorithms. We further respectively combine the first two\nsub-tasks and the last two sub-tasks to design the multi-task learning models -\nMTL-01 and MTL-02, both of which achieve higher accuracy than single-task\nmodels. Our research serves as a demonstration of applying multi-task deep\nlearning to quantitative methane monitoring and can be extended to a broad\nrange of methane monitoring tasks.\n","authors":["Guoxin Si","Shiliang Fu","Wei Yao"],"pdf_url":"https://arxiv.org/pdf/2401.12870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12862v1","updated":"2024-01-23T15:52:57Z","published":"2024-01-23T15:52:57Z","title":"FedRSU: Federated Learning for Scene Flow Estimation on Roadside Units","summary":" Roadside unit (RSU) can significantly improve the safety and robustness of\nautonomous vehicles through Vehicle-to-Everything (V2X) communication.\nCurrently, the usage of a single RSU mainly focuses on real-time inference and\nV2X collaboration, while neglecting the potential value of the high-quality\ndata collected by RSU sensors. Integrating the vast amounts of data from\nnumerous RSUs can provide a rich source of data for model training. However,\nthe absence of ground truth annotations and the difficulty of transmitting\nenormous volumes of data are two inevitable barriers to fully exploiting this\nhidden value. In this paper, we introduce FedRSU, an innovative federated\nlearning framework for self-supervised scene flow estimation. In FedRSU, we\npresent a recurrent self-supervision training paradigm, where for each RSU, the\nscene flow prediction of points at every timestamp can be supervised by its\nsubsequent future multi-modality observation. Another key component of FedRSU\nis federated learning, where multiple devices collaboratively train an ML model\nwhile keeping the training data local and private. With the power of the\nrecurrent self-supervised learning paradigm, FL is able to leverage innumerable\nunderutilized data from RSU. To verify the FedRSU framework, we construct a\nlarge-scale multi-modality dataset RSU-SF. The dataset consists of 17 RSU\nclients, covering various scenarios, modalities, and sensor settings. 
Based on\nRSU-SF, we show that FedRSU can greatly improve model performance in ITS and\nprovide a comprehensive benchmark under diverse FL scenarios. To the best of\nour knowledge, we provide the first real-world LiDAR-camera multi-modal dataset\nand benchmark for the FL community.\n","authors":["Shaoheng Fang","Rui Ye","Wenhao Wang","Zuhong Liu","Yuxiao Wang","Yafei Wang","Siheng Chen","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14391v4","updated":"2024-01-23T15:52:28Z","published":"2023-04-27T17:55:13Z","title":"Energy-based Models are Zero-Shot Planners for Compositional Scene\n Rearrangement","summary":" Language is compositional; an instruction can express multiple relation\nconstraints to hold among objects in a scene that a robot is tasked to\nrearrange. Our focus in this work is an instructable scene-rearranging\nframework that generalizes to longer instructions and to spatial concept\ncompositions never seen at training time. We propose to represent\nlanguage-instructed spatial concepts with energy functions over relative object\narrangements. A language parser maps instructions to corresponding energy\nfunctions and an open-vocabulary visual-language model grounds their arguments\nto relevant objects in the scene. We generate goal scene configurations by\ngradient descent on the sum of energy functions, one per language predicate in\nthe instruction. Local vision-based policies then re-locate objects to the\ninferred goal locations. We test our model on established instruction-guided\nmanipulation benchmarks, as well as benchmarks of compositional instructions we\nintroduce. We show our model can execute highly compositional instructions\nzero-shot in simulation and in the real world. It outperforms\nlanguage-to-action reactive policies and Large Language Model planners by a\nlarge margin, especially for long instructions that involve compositions of\nmultiple spatial concepts. Simulation and real-world robot execution videos, as\nwell as our code and datasets are publicly available on our website:\nhttps://ebmplanner.github.io.\n","authors":["Nikolaos Gkanatsios","Ayush Jain","Zhou Xian","Yunchu Zhang","Christopher Atkeson","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2304.14391v4.pdf","comment":"First two authors contributed equally | RSS 2023"},{"id":"http://arxiv.org/abs/2309.01141v4","updated":"2024-01-23T15:51:18Z","published":"2023-09-03T11:32:28Z","title":"VGDiffZero: Text-to-image Diffusion Models Can Be Zero-shot Visual\n Grounders","summary":" Large-scale text-to-image diffusion models have shown impressive capabilities\nfor generative tasks by leveraging strong vision-language alignment from\npre-training. However, most vision-language discriminative tasks require\nextensive fine-tuning on carefully-labeled datasets to acquire such alignment,\nwith great cost in time and computing resources. In this work, we explore\ndirectly applying a pre-trained generative diffusion model to the challenging\ndiscriminative task of visual grounding without any fine-tuning and additional\ntraining dataset. Specifically, we propose VGDiffZero, a simple yet effective\nzero-shot visual grounding framework based on text-to-image diffusion models.\nWe also design a comprehensive region-scoring method considering both global\nand local contexts of each isolated proposal. Extensive experiments on RefCOCO,\nRefCOCO+, and RefCOCOg show that VGDiffZero achieves strong performance on\nzero-shot visual grounding. 
Our code is available at\nhttps://github.com/xuyang-liu16/VGDiffZero.\n","authors":["Xuyang Liu","Siteng Huang","Yachen Kang","Honggang Chen","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.01141v4.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12851v1","updated":"2024-01-23T15:35:50Z","published":"2024-01-23T15:35:50Z","title":"Classification of grapevine varieties using UAV hyperspectral imaging","summary":" The classification of different grapevine varieties is a relevant phenotyping\ntask in Precision Viticulture since it enables estimating the growth of\nvineyard rows dedicated to different varieties, among other applications\nconcerning the wine industry. This task can be performed with destructive\nmethods that require time-consuming tasks, including data collection and\nanalysis in the laboratory. However, Unmanned Aerial Vehicles (UAV) provide a\nmore efficient and less prohibitive approach to collecting hyperspectral data,\ndespite acquiring noisier data. Therefore, the first task is the processing of\nthese data to correct and downsample large amounts of data. In addition, the\nhyperspectral signatures of grape varieties are very similar. In this work, a\nConvolutional Neural Network (CNN) is proposed for classifying seventeen\nvarieties of red and white grape variants. Rather than classifying single\nsamples, these are processed together with their neighbourhood. Hence, the\nextraction of spatial and spectral features is addressed with 1) a spatial\nattention layer and 2) Inception blocks. The pipeline goes from processing to\ndataset elaboration, finishing with the training phase. The fitted model is\nevaluated in terms of response time, accuracy and data separability, and\ncompared with other state-of-the-art CNNs for classifying hyperspectral data.\nOur network was proven to be much more lightweight with a reduced number of\ninput bands, a lower number of trainable weights and therefore, reduced\ntraining time. Despite this, the evaluated metrics showed much better results\nfor our network (~99% overall accuracy), in comparison with previous works\nbarely achieving 81% OA.\n","authors":["Alfonso López","Carlos Javier Ogayar","Francisco Ramón Feito","Joaquim João Sousa"],"pdf_url":"https://arxiv.org/pdf/2401.12851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01651v3","updated":"2024-01-23T15:31:17Z","published":"2024-01-03T10:08:40Z","title":"AIGCBench: Comprehensive Evaluation of Image-to-Video Content Generated\n by AI","summary":" The burgeoning field of Artificial Intelligence Generated Content (AIGC) is\nwitnessing rapid advancements, particularly in video generation. This paper\nintroduces AIGCBench, a pioneering comprehensive and scalable benchmark\ndesigned to evaluate a variety of video generation tasks, with a primary focus\non Image-to-Video (I2V) generation. AIGCBench tackles the limitations of\nexisting benchmarks, which suffer from a lack of diverse datasets, by including\na varied and open-domain image-text dataset that evaluates different\nstate-of-the-art algorithms under equivalent conditions. We employ a novel text\ncombiner and GPT-4 to create rich text prompts, which are then used to generate\nimages via advanced Text-to-Image models. To establish a unified evaluation\nframework for video generation tasks, our benchmark includes 11 metrics\nspanning four dimensions to assess algorithm performance. These dimensions are\ncontrol-video alignment, motion effects, temporal consistency, and video\nquality. 
These metrics are both reference video-dependent and video-free,\nensuring a comprehensive evaluation strategy. The evaluation standard proposed\ncorrelates well with human judgment, providing insights into the strengths and\nweaknesses of current I2V algorithms. The findings from our extensive\nexperiments aim to stimulate further research and development in the I2V field.\nAIGCBench represents a significant step toward creating standardized benchmarks\nfor the broader AIGC landscape, proposing an adaptable and equitable framework\nfor future assessments of video generation tasks. We have open-sourced the\ndataset and evaluation code on the project website:\nhttps://www.benchcouncil.org/AIGCBench.\n","authors":["Fanda Fan","Chunjie Luo","Wanling Gao","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2401.01651v3.pdf","comment":"Accepted to BenchCouncil Transactions on Benchmarks, Standards and\n Evaluations (TBench)"},{"id":"http://arxiv.org/abs/2401.12074v2","updated":"2024-01-23T15:23:03Z","published":"2024-01-22T16:14:26Z","title":"DeepCERES: A Deep learning method for cerebellar lobule segmentation\n using ultra-high resolution multimodal MRI","summary":" This paper introduces a novel multimodal and high-resolution human brain\ncerebellum lobule segmentation method. Unlike current tools that operate at\nstandard resolution ($1 \\text{ mm}^{3}$) or using mono-modal data, the proposed\nmethod improves cerebellum lobule segmentation through the use of a multimodal\nand ultra-high resolution ($0.125 \\text{ mm}^{3}$) training dataset. To develop\nthe method, first, a database of semi-automatically labelled cerebellum lobules\nwas created to train the proposed method with ultra-high resolution T1 and T2\nMR images. Then, an ensemble of deep networks has been designed and developed,\nallowing the proposed method to excel in the complex cerebellum lobule\nsegmentation task, improving precision while being memory efficient. Notably,\nour approach deviates from the traditional U-Net model by exploring alternative\narchitectures. We have also integrated deep learning with classical machine\nlearning methods incorporating a priori knowledge from multi-atlas\nsegmentation, which improved precision and robustness. Finally, a new online\npipeline, named DeepCERES, has been developed to make available the proposed\nmethod to the scientific community requiring as input only a single T1 MR image\nat standard resolution.\n","authors":["Sergio Morell-Ortega","Marina Ruiz-Perez","Marien Gadea","Roberto Vivo-Hernando","Gregorio Rubio","Fernando Aparici","Maria de la Iglesia-Vaya","Gwenaelle Catheline","Pierrick Coupé","José V. Manjón"],"pdf_url":"https://arxiv.org/pdf/2401.12074v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2310.00367v2","updated":"2024-01-23T15:20:33Z","published":"2023-09-30T13:15:49Z","title":"AutomaTikZ: Text-Guided Synthesis of Scientific Vector Graphics with\n TikZ","summary":" Generating bitmap graphics from text has gained considerable attention, yet\nfor scientific figures, vector graphics are often preferred. Given that vector\ngraphics are typically encoded using low-level graphics primitives, generating\nthem directly is difficult. To address this, we propose the use of TikZ, a\nwell-known abstract graphics language that can be compiled to vector graphics,\nas an intermediate representation of scientific figures. TikZ offers\nhuman-oriented, high-level commands, thereby facilitating conditional language\nmodeling with any large language model. 
To this end, we introduce DaTikZ, the\nfirst large-scale TikZ dataset consisting of 120k TikZ drawings aligned with\ncaptions. We fine-tune LLaMA on DaTikZ, as well as our new model CLiMA, which\naugments LLaMA with multimodal CLIP embeddings. In both human and automatic\nevaluation, CLiMA and LLaMA outperform commercial GPT-4 and Claude 2 in terms\nof similarity to human-created figures, with CLiMA additionally improving\ntext-image alignment. Our detailed analysis shows that all models generalize\nwell and are not susceptible to memorization. GPT-4 and Claude 2, however, tend\nto generate more simplistic figures compared to both humans and our models. We\nmake our framework, AutomaTikZ, along with model weights and datasets, publicly\navailable.\n","authors":["Jonas Belouadi","Anne Lauscher","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2310.00367v2.pdf","comment":"Accepted at ICLR 2024 (poster); Project Page:\n https://github.com/potamides/AutomaTikZ"},{"id":"http://arxiv.org/abs/2401.12835v1","updated":"2024-01-23T15:18:20Z","published":"2024-01-23T15:18:20Z","title":"SGTR+: End-to-end Scene Graph Generation with Transformer","summary":" Scene Graph Generation (SGG) remains a challenging visual understanding task\ndue to its compositional property. Most previous works adopt a bottom-up,\ntwo-stage or point-based, one-stage approach, which often suffers from high\ntime complexity or suboptimal designs. In this work, we propose a novel SGG\nmethod to address the aforementioned issues, formulating the task as a\nbipartite graph construction problem. To address the issues above, we create a\ntransformer-based end-to-end framework to generate the entity and entity-aware\npredicate proposal set, and infer directed edges to form relation triplets.\nMoreover, we design a graph assembling module to infer the connectivity of the\nbipartite scene graph based on our entity-aware structure, enabling us to\ngenerate the scene graph in an end-to-end manner. Based on bipartite graph\nassembling paradigm, we further propose a new technical design to address the\nefficacy of entity-aware modeling and optimization stability of graph\nassembling. Equipped with the enhanced entity-aware design, our method achieves\noptimal performance and time-complexity. Extensive experimental results show\nthat our design is able to achieve the state-of-the-art or comparable\nperformance on three challenging benchmarks, surpassing most of the existing\napproaches and enjoying higher efficiency in inference. Code is available:\nhttps://github.com/Scarecrow0/SGTR\n","authors":["Rongjie Li","Songyang Zhang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2401.12835v1.pdf","comment":"Accepted by TPAMI: https://ieeexplore.ieee.org/document/10315230"},{"id":"http://arxiv.org/abs/2401.12820v1","updated":"2024-01-23T14:53:32Z","published":"2024-01-23T14:53:32Z","title":"DatUS^2: Data-driven Unsupervised Semantic Segmentation with Pre-trained\n Self-supervised Vision Transformer","summary":" Successive proposals of several self-supervised training schemes continue to\nemerge, taking one step closer to developing a universal foundation model. In\nthis process, the unsupervised downstream tasks are recognized as one of the\nevaluation methods to validate the quality of visual features learned with a\nself-supervised training scheme. 
However, unsupervised dense semantic\nsegmentation has not been explored as a downstream task, which can utilize and\nevaluate the quality of semantic information introduced in patch-level feature\nrepresentations during self-supervised training of a vision transformer.\nTherefore, this paper proposes a novel data-driven approach for unsupervised\nsemantic segmentation (DatUS^2) as a downstream task. DatUS^2 generates\nsemantically consistent and dense pseudo-annotated segmentation masks for the\nunlabeled image dataset without using any visual-prior or synchronized data. We\ncompare these pseudo-annotated segmentation masks with ground truth masks for\nevaluating recent self-supervised training schemes to learn shared semantic\nproperties at the patch level and discriminative semantic properties at the\nsegment level. Finally, we evaluate existing state-of-the-art self-supervised\ntraining schemes with our proposed downstream task, i.e., DatUS^2. Also, the\nbest version of DatUS^2 outperforms the existing state-of-the-art method for\nthe unsupervised dense semantic segmentation task with 15.02% MiOU and 21.47%\nPixel accuracy on the SUIM dataset. It also achieves a competitive level of\naccuracy for a large-scale and complex dataset, i.e., the COCO dataset.\n","authors":["Sonal Kumar","Arijit Sur","Rashmi Dutta Baruah"],"pdf_url":"https://arxiv.org/pdf/2401.12820v1.pdf","comment":"The manuscript contains 13 pages, 9 figures and 7 tables"},{"id":"http://arxiv.org/abs/2308.14190v2","updated":"2024-01-23T14:51:41Z","published":"2023-08-27T19:43:43Z","title":"Score-Based Generative Models for PET Image Reconstruction","summary":" Score-based generative models have demonstrated highly promising results for\nmedical image reconstruction tasks in magnetic resonance imaging or computed\ntomography. However, their application to Positron Emission Tomography (PET) is\nstill largely unexplored. PET image reconstruction involves a variety of\nchallenges, including Poisson noise with high variance and a wide dynamic\nrange. To address these challenges, we propose several PET-specific adaptations\nof score-based generative models. The proposed framework is developed for both\n2D and 3D PET. In addition, we provide an extension to guided reconstruction\nusing magnetic resonance images. We validate the approach through extensive 2D\nand 3D $\\textit{in-silico}$ experiments with a model trained on\npatient-realistic data without lesions, and evaluate on data without lesions as\nwell as out-of-distribution data with lesions. This demonstrates the proposed\nmethod's robustness and significant potential for improved PET reconstruction.\n","authors":["Imraj RD Singh","Alexander Denker","Riccardo Barbano","Željko Kereta","Bangti Jin","Kris Thielemans","Peter Maass","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2308.14190v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:001"},{"id":"http://arxiv.org/abs/2401.12761v1","updated":"2024-01-23T13:43:17Z","published":"2024-01-23T13:43:17Z","title":"MUSES: The Multi-Sensor Semantic Perception Dataset for Driving under\n Uncertainty","summary":" Achieving level-5 driving automation in autonomous vehicles necessitates a\nrobust semantic visual perception system capable of parsing data from different\nsensors across diverse conditions. 
However, existing semantic perception\ndatasets often lack important non-camera modalities typically used in\nautonomous vehicles, or they do not exploit such modalities to aid and improve\nsemantic annotations in challenging conditions. To address this, we introduce\nMUSES, the MUlti-SEnsor Semantic perception dataset for driving in adverse\nconditions under increased uncertainty. MUSES includes synchronized multimodal\nrecordings with 2D panoptic annotations for 2500 images captured under diverse\nweather and illumination. The dataset integrates a frame camera, a lidar, a\nradar, an event camera, and an IMU/GNSS sensor. Our new two-stage panoptic\nannotation protocol captures both class-level and instance-level uncertainty in\nthe ground truth and enables the novel task of uncertainty-aware panoptic\nsegmentation we introduce, along with standard semantic and panoptic\nsegmentation. MUSES proves both effective for training and challenging for\nevaluating models under diverse visual conditions, and it opens new avenues for\nresearch in multimodal and uncertainty-aware dense semantic perception. Our\ndataset and benchmark will be made publicly available.\n","authors":["Tim Brödermann","David Bruggemann","Christos Sakaridis","Kevin Ta","Odysseas Liagouris","Jason Corkill","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2401.12761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12751v1","updated":"2024-01-23T13:30:43Z","published":"2024-01-23T13:30:43Z","title":"PSDF: Prior-Driven Neural Implicit Surface Learning for Multi-view\n Reconstruction","summary":" Surface reconstruction has traditionally relied on the Multi-View Stereo\n(MVS)-based pipeline, which often suffers from noisy and incomplete geometry.\nThis is due to that although MVS has been proven to be an effective way to\nrecover the geometry of the scenes, especially for locally detailed areas with\nrich textures, it struggles to deal with areas with low texture and large\nvariations of illumination where the photometric consistency is unreliable.\nRecently, Neural Implicit Surface Reconstruction (NISR) combines surface\nrendering and volume rendering techniques and bypasses the MVS as an\nintermediate step, which has emerged as a promising alternative to overcome the\nlimitations of traditional pipelines. While NISR has shown impressive results\non simple scenes, it remains challenging to recover delicate geometry from\nuncontrolled real-world scenes which is caused by its underconstrained\noptimization. To this end, the framework PSDF is proposed which resorts to\nexternal geometric priors from a pretrained MVS network and internal geometric\npriors inherent in the NISR model to facilitate high-quality neural implicit\nsurface learning. Specifically, the visibility-aware feature consistency loss\nand depth prior-assisted sampling based on external geometric priors are\nintroduced. These proposals provide powerfully geometric consistency\nconstraints and aid in locating surface intersection points, thereby\nsignificantly improving the accuracy and delicate reconstruction of NISR.\nMeanwhile, the internal prior-guided importance rendering is presented to\nenhance the fidelity of the reconstructed surface mesh by mitigating the biased\nrendering issue in NISR. 
Extensive experiments on the Tanks and Temples dataset\nshow that PSDF achieves state-of-the-art performance on complex uncontrolled\nscenes.\n","authors":["Wanjuan Su","Chen Zhang","Qingshan Xu","Wenbing Tao"],"pdf_url":"https://arxiv.org/pdf/2401.12751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12743v1","updated":"2024-01-23T13:20:57Z","published":"2024-01-23T13:20:57Z","title":"Correlation-Embedded Transformer Tracking: A Single-Branch Framework","summary":" Developing robust and discriminative appearance models has been a\nlong-standing research challenge in visual object tracking. In the prevalent\nSiamese-based paradigm, the features extracted by the Siamese-like networks are\noften insufficient to model the tracked targets and distractor objects, thereby\nhindering them from being robust and discriminative simultaneously. While most\nSiamese trackers focus on designing robust correlation operations, we propose a\nnovel single-branch tracking framework inspired by the transformer. Unlike the\nSiamese-like feature extraction, our tracker deeply embeds cross-image feature\ncorrelation in multiple layers of the feature network. By extensively matching\nthe features of the two images through multiple layers, it can suppress\nnon-target features, resulting in target-aware feature extraction. The output\nfeatures can be directly used for predicting target locations without\nadditional correlation steps. Thus, we reformulate the two-branch Siamese\ntracking as a conceptually simple, fully transformer-based Single-Branch\nTracking pipeline, dubbed SBT. After conducting an in-depth analysis of the SBT\nbaseline, we summarize many effective design principles and propose an improved\ntracker dubbed SuperSBT. SuperSBT adopts a hierarchical architecture with a\nlocal modeling layer to enhance shallow-level features. A unified relation\nmodeling is proposed to remove complex handcrafted layer pattern designs.\nSuperSBT is further improved by masked image modeling pre-training, integrating\ntemporal modeling, and equipping with dedicated prediction heads. Thus,\nSuperSBT outperforms the SBT baseline by 4.7%, 3.0%, and 4.5% AUC scores in\nLaSOT, TrackingNet, and GOT-10K. Notably, SuperSBT greatly raises the speed of\nSBT from 37 FPS to 81 FPS. Extensive experiments show that our method achieves\nsuperior results on eight VOT benchmarks.\n","authors":["Fei Xie","Wankou Yang","Chunyu Wang","Lei Chu","Yue Cao","Chao Ma","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.12743v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2307.03212v2","updated":"2024-01-23T13:15:31Z","published":"2023-07-06T16:38:43Z","title":"Region-Wise Attentive Multi-View Representation Learning for Urban\n Region Embeddings","summary":" Urban region embedding is an important and yet highly challenging issue due\nto the complexity and constantly changing nature of urban data. To address the\nchallenges, we propose a Region-Wise Multi-View Representation Learning (ROMER)\nto capture multi-view dependencies and learn expressive representations of\nurban regions without the constraints of rigid neighbourhood region conditions.\nOur model focuses on learning urban region representations from multi-source\nurban data. First, we capture the multi-view correlations from mobility flow\npatterns, POI semantics and check-in dynamics. Then, we adopt global graph\nattention networks to learn the similarity of any two vertices in graphs. 
To\ncomprehensively consider and share features of multiple views, a two-stage\nfusion module is further proposed to learn weights with external attention to\nfuse multi-view embeddings. Extensive experiments for two downstream tasks on\nreal-world datasets demonstrate that our model outperforms state-of-the-art\nmethods by up to 17\\% improvement.\n","authors":["Weiliang Chan","Qianqian Ren"],"pdf_url":"https://arxiv.org/pdf/2307.03212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12736v1","updated":"2024-01-23T13:13:45Z","published":"2024-01-23T13:13:45Z","title":"Shift-ConvNets: Small Convolutional Kernel with Large Kernel Effects","summary":" Recent studies reveal that the remarkable performance of Vision transformers\n(ViTs) benefits from large receptive fields. For this reason, the large\nconvolutional kernel design becomes an ideal solution to make Convolutional\nNeural Networks (CNNs) great again. However, the typical large convolutional\nkernels turn out to be hardware-unfriendly operators, resulting in discount\ncompatibility of various hardware platforms. Thus, it is unwise to simply\nenlarge the convolutional kernel size. In this paper, we reveal that small\nconvolutional kernels and convolution operations can achieve the closing\neffects of large kernel sizes. Then, we propose a shift-wise operator that\nensures the CNNs capture long-range dependencies with the help of the sparse\nmechanism, while remaining hardware-friendly. Experimental results show that\nour shift-wise operator significantly improves the accuracy of a regular CNN\nwhile markedly reducing computational requirements. On the ImageNet-1k, our\nshift-wise enhanced CNN model outperforms the state-of-the-art models. Code &\nmodels at https://github.com/lidc54/shift-wiseConv.\n","authors":["Dachong Li","Li Li","Zhuangzhuang Chen","Jianqiang Li"],"pdf_url":"https://arxiv.org/pdf/2401.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12729v1","updated":"2024-01-23T13:02:11Z","published":"2024-01-23T13:02:11Z","title":"Enhancing Object Detection Performance for Small Objects through\n Synthetic Data Generation and Proportional Class-Balancing Technique: A\n Comparative Study in Industrial Scenarios","summary":" Object Detection (OD) has proven to be a significant computer vision method\nin extracting localized class information and has multiple applications in the\nindustry. Although many of the state-of-the-art (SOTA) OD models perform well\non medium and large sized objects, they seem to under perform on small objects.\nIn most of the industrial use cases, it is difficult to collect and annotate\ndata for small objects, as it is time-consuming and prone to human errors.\nAdditionally, those datasets are likely to be unbalanced and often result in an\ninefficient model convergence. To tackle this challenge, this study presents a\nnovel approach that injects additional data points to improve the performance\nof the OD models. Using synthetic data generation, the difficulties in data\ncollection and annotations for small object data points can be minimized and to\ncreate a dataset with balanced distribution. This paper discusses the effects\nof a simple proportional class-balancing technique, to enable better anchor\nmatching of the OD models. 
A comparison was carried out on the performances of\nthe SOTA OD models: YOLOv5, YOLOv7 and SSD, for combinations of real and\nsynthetic datasets within an industrial use case.\n","authors":["Jibinraj Antony","Vinit Hegiste","Ali Nazeri","Hooman Tavakoli","Snehal Walunj","Christiane Plociennik","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2401.12729v1.pdf","comment":"Accepted and presented in conference ESAIM23 1st European Symposium\n on Artificial Intelligence in Manufacturing"},{"id":"http://arxiv.org/abs/2401.12725v1","updated":"2024-01-23T12:53:37Z","published":"2024-01-23T12:53:37Z","title":"Two-View Topogram-Based Anatomy-Guided CT Reconstruction for Prospective\n Risk Minimization","summary":" To facilitate a prospective estimation of CT effective dose and risk\nminimization process, a prospective spatial dose estimation and the known\nanatomical structures are expected. To this end, a CT reconstruction method is\nrequired to reconstruct CT volumes from as few projections as possible, i.e. by\nusing the topograms, with anatomical structures as correct as possible. In this\nwork, an optimized CT reconstruction model based on a generative adversarial\nnetwork (GAN) is proposed. The GAN is trained to reconstruct 3D volumes from an\nanterior-posterior and a lateral CT projection. To enhance anatomical\nstructures, a pre-trained organ segmentation network and the 3D perceptual loss\nare applied during the training phase, so that the model can then generate both\norgan-enhanced CT volume and the organ segmentation mask. The proposed method\ncan reconstruct CT volumes with PSNR of 26.49, RMSE of 196.17, and SSIM of\n0.64, compared to 26.21, 201.55 and 0.63 using the baseline method. In terms of\nthe anatomical structure, the proposed method effectively enhances the organ\nshape and boundary and allows for a straight-forward identification of the\nrelevant anatomical structures. We note that conventional reconstruction\nmetrics fail to indicate the enhancement of anatomical structures. In addition\nto such metrics, the evaluation is expanded with assessing the organ\nsegmentation performance. The average organ dice of the proposed method is 0.71\ncompared with 0.63 in baseline model, indicating the enhancement of anatomical\nstructures.\n","authors":["Chang Liu","Laura Klein","Yixing Huang","Edith Baader","Michael Lell","Marc Kachelrieß","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2401.12725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12694v1","updated":"2024-01-23T11:58:08Z","published":"2024-01-23T11:58:08Z","title":"Pragmatic Communication in Multi-Agent Collaborative Perception","summary":" Collaborative perception allows each agent to enhance its perceptual\nabilities by exchanging messages with others. It inherently results in a\ntrade-off between perception ability and communication costs. Previous works\ntransmit complete full-frame high-dimensional feature maps among agents,\nresulting in substantial communication costs. To promote communication\nefficiency, we propose only transmitting the information needed for the\ncollaborator's downstream task. 
This pragmatic communication strategy focuses\non three key aspects: i) pragmatic message selection, which selects\ntask-critical parts from the complete data, resulting in spatially and\ntemporally sparse feature vectors; ii) pragmatic message representation, which\nachieves pragmatic approximation of high-dimensional feature vectors with a\ntask-adaptive dictionary, enabling communicating with integer indices; iii)\npragmatic collaborator selection, which identifies beneficial collaborators,\npruning unnecessary communication links. Following this strategy, we first\nformulate a mathematical optimization framework for the\nperception-communication trade-off and then propose PragComm, a multi-agent\ncollaborative perception system with two key components: i) single-agent\ndetection and tracking and ii) pragmatic collaboration. The proposed PragComm\npromotes pragmatic communication and adapts to a wide range of communication\nconditions. We evaluate PragComm for both collaborative 3D object detection and\ntracking tasks in both real-world, V2V4Real, and simulation datasets, OPV2V and\nV2X-SIM2.0. PragComm consistently outperforms previous methods with more than\n32.7K times lower communication volume on OPV2V. Code is available at\ngithub.com/PhyllisH/PragComm.\n","authors":["Yue Hu","Xianghe Pang","Xiaoqi Qin","Yonina C. Eldar","Siheng Chen","Ping Zhang","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.12694v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2401.12689v1","updated":"2024-01-23T11:54:09Z","published":"2024-01-23T11:54:09Z","title":"Energy-based Automated Model Evaluation","summary":" The conventional evaluation protocols on machine learning models rely heavily\non a labeled, i.i.d-assumed testing dataset, which is not often present in real\nworld applications. The Automated Model Evaluation (AutoEval) shows an\nalternative to this traditional workflow, by forming a proximal prediction\npipeline of the testing performance without the presence of ground-truth\nlabels. Despite its recent successes, the AutoEval frameworks still suffer from\nan overconfidence issue, substantial storage and computational cost. In that\nregard, we propose a novel measure -- Meta-Distribution Energy (MDE) -- that\nallows the AutoEval framework to be both more efficient and effective. The core\nof the MDE is to establish a meta-distribution statistic, on the information\n(energy) associated with individual samples, then offer a smoother\nrepresentation enabled by energy-based learning. We further provide our\ntheoretical insights by connecting the MDE with the classification loss. We\nprovide extensive experiments across modalities, datasets and different\narchitectural backbones to validate MDE's validity, together with its\nsuperiority compared with prior approaches. We also prove MDE's versatility by\nshowing its seamless integration with large-scale models, and easy adaption to\nlearning scenarios with noisy- or imbalanced- labels.\n","authors":["Ru Peng","Heming Zou","Haobo Wang","Yawen Zeng","Zenan Huang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.12689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02246v3","updated":"2024-01-23T11:26:42Z","published":"2023-12-04T14:45:56Z","title":"Conditional Variational Diffusion Models","summary":" Inverse problems aim to determine parameters from observations, a crucial\ntask in engineering and science. 
Lately, generative models, especially\ndiffusion models, have gained popularity in this area for their ability to\nproduce realistic solutions and their good mathematical properties. Despite\ntheir success, an important drawback of diffusion models is their sensitivity\nto the choice of variance schedule, which controls the dynamics of the\ndiffusion process. Fine-tuning this schedule for specific applications is\ncrucial but time-costly and does not guarantee an optimal result. We propose a\nnovel approach for learning the schedule as part of the training process. Our\nmethod supports probabilistic conditioning on data, provides high-quality\nsolutions, and is flexible, proving able to adapt to different applications\nwith minimum overhead. This approach is tested in two unrelated inverse\nproblems: super-resolution microscopy and quantitative phase imaging, yielding\ncomparable or superior results to previous methods and fine-tuned diffusion\nmodels. We conclude that fine-tuning the schedule by experimentation should be\navoided because it can be learned during training in a stable way that yields\nbetter results.\n","authors":["Gabriel della Maggiora","Luis Alberto Croquevielle","Nikita Deshpande","Harry Horsley","Thomas Heinis","Artur Yakimovich"],"pdf_url":"https://arxiv.org/pdf/2312.02246v3.pdf","comment":"Denoising Diffusion Probabilistic Models, Inverse Problems,\n Generative Models, Super Resolution, Phase Quantification, Variational\n Methods"},{"id":"http://arxiv.org/abs/2401.07709v2","updated":"2024-01-23T11:22:03Z","published":"2024-01-15T14:25:54Z","title":"Towards Efficient Diffusion-Based Image Editing with Instant Attention\n Masks","summary":" Diffusion-based Image Editing (DIE) is an emerging research hot-spot, which\noften applies a semantic mask to control the target area for diffusion-based\nediting. However, most existing solutions obtain these masks via manual\noperations or off-line processing, greatly reducing their efficiency. In this\npaper, we propose a novel and efficient image editing method for Text-to-Image\n(T2I) diffusion models, termed Instant Diffusion Editing(InstDiffEdit). In\nparticular, InstDiffEdit aims to employ the cross-modal attention ability of\nexisting diffusion models to achieve instant mask guidance during the diffusion\nsteps. To reduce the noise of attention maps and realize the full automatics,\nwe equip InstDiffEdit with a training-free refinement scheme to adaptively\naggregate the attention distributions for the automatic yet accurate mask\ngeneration. Meanwhile, to supplement the existing evaluations of DIE, we\npropose a new benchmark called Editing-Mask to examine the mask accuracy and\nlocal editing ability of existing methods. To validate InstDiffEdit, we also\nconduct extensive experiments on ImageNet and Imagen, and compare it with a\nbunch of the SOTA methods. 
The experimental results show that InstDiffEdit not\nonly outperforms the SOTA methods in both image quality and editing results,\nbut also has a much faster inference speed, i.e., +5 to +6 times.\n","authors":["Siyu Zou","Jiji Tang","Yiyi Zhou","Jing He","Chaoyi Zhao","Rongsheng Zhang","Zhipeng Hu","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2401.07709v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.12665v1","updated":"2024-01-23T11:20:03Z","published":"2024-01-23T11:20:03Z","title":"ClipSAM: CLIP and SAM Collaboration for Zero-Shot Anomaly Segmentation","summary":" Recently, foundational models such as CLIP and SAM have shown promising\nperformance for the task of Zero-Shot Anomaly Segmentation (ZSAS). However,\neither CLIP-based or SAM-based ZSAS methods still suffer from non-negligible\nkey drawbacks: 1) CLIP primarily focuses on global feature alignment across\ndifferent inputs, leading to imprecise segmentation of local anomalous parts;\n2) SAM tends to generate numerous redundant masks without proper prompt\nconstraints, resulting in complex post-processing requirements. In this work,\nwe innovatively propose a CLIP and SAM collaboration framework called ClipSAM\nfor ZSAS. The insight behind ClipSAM is to employ CLIP's semantic understanding\ncapability for anomaly localization and rough segmentation, which is further\nused as the prompt constraints for SAM to refine the anomaly segmentation\nresults. In details, we introduce a crucial Unified Multi-scale Cross-modal\nInteraction (UMCI) module for interacting language with visual features at\nmultiple scales of CLIP to reason anomaly positions. Then, we design a novel\nMulti-level Mask Refinement (MMR) module, which utilizes the positional\ninformation as multi-level prompts for SAM to acquire hierarchical levels of\nmasks and merges them. Extensive experiments validate the effectiveness of our\napproach, achieving the optimal segmentation performance on the MVTec-AD and\nVisA datasets.\n","authors":["Shengze Li","Jianjian Cao","Peng Ye","Yuhan Ding","Chongjun Tu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12665v1.pdf","comment":"7 pages,6 figures"},{"id":"http://arxiv.org/abs/2401.12648v1","updated":"2024-01-23T10:56:01Z","published":"2024-01-23T10:56:01Z","title":"Consistency Enhancement-Based Deep Multiview Clustering via Contrastive\n Learning","summary":" Multiview clustering (MVC) segregates data samples into meaningful clusters\nby synthesizing information across multiple views. Moreover, deep\nlearning-based methods have demonstrated their strong feature learning\ncapabilities in MVC scenarios. However, effectively generalizing feature\nrepresentations while maintaining consistency is still an intractable problem.\nIn addition, most existing deep clustering methods based on contrastive\nlearning overlook the consistency of the clustering representations during the\nclustering process. In this paper, we show how the above problems can be\novercome and propose a consistent enhancement-based deep MVC method via\ncontrastive learning (CCEC). Specifically, semantic connection blocks are\nincorporated into a feature representation to preserve the consistent\ninformation among multiple views. Furthermore, the representation process for\nclustering is enhanced through spectral clustering, and the consistency across\nmultiple views is improved. 
Experiments conducted on five datasets demonstrate\nthe effectiveness and superiority of our method in comparison with the\nstate-of-the-art (SOTA) methods. The code for this method can be accessed at\nhttps://anonymous.4open.science/r/CCEC-E84E/.\n","authors":["Hao Yang","Hua Mao","Wai Lok Woo","Jie Chen","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2401.12648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12609v1","updated":"2024-01-23T10:07:41Z","published":"2024-01-23T10:07:41Z","title":"Fast Semi-supervised Unmixing using Non-convex Optimization","summary":" In this paper, we introduce a novel linear model tailored for\nsemisupervised/library-based unmixing. Our model incorporates considerations\nfor library mismatch while enabling the enforcement of the abundance sum-to-one\nconstraint (ASC). Unlike conventional sparse unmixing methods, this model\ninvolves nonconvex optimization, presenting significant computational\nchallenges. We demonstrate the efficacy of Alternating Methods of Multipliers\n(ADMM) in cyclically solving these intricate problems. We propose two\nsemisupervised unmixing approaches, each relying on distinct priors applied to\nthe new model in addition to the ASC: sparsity prior and convexity constraint.\nOur experimental results validate that enforcing the convexity constraint\noutperforms the sparsity prior for the endmember library. These results are\ncorroborated across three simulated datasets (accounting for spectral\nvariability and varying pixel purity levels) and the Cuprite dataset.\nAdditionally, our comparison with conventional sparse unmixing methods\nshowcases considerable advantages of our proposed model, which entails\nnonconvex optimization. Notably, our implementations of the proposed\nalgorithms-fast semisupervised unmixing (FaSUn) and sparse unmixing using\nsoft-shrinkage (SUnS)-prove considerably more efficient than traditional sparse\nunmixing methods. SUnS and FaSUn were implemented using PyTorch and provided in\na dedicated Python package called Fast Semisupervised Unmixing (FUnmix), which\nis open-source and available at https://github.com/BehnoodRasti/FUnmix\n","authors":["Behnood Rasti","Alexandre Zouaoui","Julien Mairal","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2401.12609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12596v1","updated":"2024-01-23T09:49:24Z","published":"2024-01-23T09:49:24Z","title":"UniHDA: Towards Universal Hybrid Domain Adaptation of Image Generators","summary":" Generative domain adaptation has achieved remarkable progress, enabling us to\nadapt a pre-trained generator to a new target domain. However, existing methods\nsimply adapt the generator to a single target domain and are limited to a\nsingle modality, either text-driven or image-driven. Moreover, they are prone\nto overfitting domain-specific attributes, which inevitably compromises\ncross-domain consistency. In this paper, we propose UniHDA, a unified and\nversatile framework for generative hybrid domain adaptation with multi-modal\nreferences from multiple domains. We use CLIP encoder to project multi-modal\nreferences into a unified embedding space and then linear interpolate the\ndirection vectors from multiple target domains to achieve hybrid domain\nadaptation. To ensure the cross-domain consistency, we propose a novel\ncross-domain spatial structure (CSS) loss that maintains detailed spatial\nstructure information between source and target generator. 
Experiments show\nthat the adapted generator can synthesise realistic images with various\nattribute compositions. Additionally, our framework generalizes to multiple\ngenerators, e.g., StyleGAN2 and Diffusion Models.\n","authors":["Hengjia Li","Yang Liu","Yuqi Lin","Zhanwei Zhang","Yibo Zhao","weihang Pan","Tu Zheng","Zheng Yang","Yuchun Jiang","Boxi Wu","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2401.12596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08673v2","updated":"2024-01-23T09:47:26Z","published":"2023-12-14T06:17:15Z","title":"Segment Beyond View: Handling Partially Missing Modality for\n Audio-Visual Semantic Segmentation","summary":" Augmented Reality (AR) devices, emerging as prominent mobile interaction\nplatforms, face challenges in user safety, particularly concerning oncoming\nvehicles. While some solutions leverage onboard camera arrays, these cameras\noften have limited field-of-view (FoV) with front or downward perspectives.\nAddressing this, we propose a new out-of-view semantic segmentation task and\nSegment Beyond View (SBV), a novel audio-visual semantic segmentation method.\nSBV supplements the visual modality, which misses information beyond the FoV,\nwith auditory information using a teacher-student distillation model\n(Omni2Ego). The model consists of a vision teacher utilising panoramic\ninformation, an auditory teacher with 8-channel audio, and an audio-visual\nstudent that takes views with limited FoV and binaural audio as input and\nproduces semantic segmentation for objects outside the FoV. SBV outperforms existing\nmodels in comparative evaluations and shows consistent performance across\nvarying FoV ranges and in monaural audio settings.\n","authors":["Renjie Wu","Hu Wang","Feras Dayoub","Hsiang-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2312.08673v2.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2401.12592v1","updated":"2024-01-23T09:47:13Z","published":"2024-01-23T09:47:13Z","title":"RGBD Objects in the Wild: Scaling Real-World 3D Object Learning from\n RGB-D Videos","summary":" We introduce a new RGB-D object dataset captured in the wild called\nWildRGB-D. Unlike most existing real-world object-centric datasets which only\ncome with RGB capturing, the direct capture of the depth channel allows better\n3D annotations and broader downstream applications. WildRGB-D comprises\nlarge-scale category-level RGB-D object videos, which are taken using an iPhone\nto go around the objects in 360 degrees. It contains around 8500 recorded\nobjects and nearly 20000 RGB-D videos across 46 common object categories. These\nvideos are taken with diverse cluttered backgrounds and three setups to cover\nas many real-world scenarios as possible: (i) a single object in one video;\n(ii) multiple objects in one video; and (iii) an object with a static hand in\none video. The dataset is annotated with object masks, real-world scale camera\nposes, and reconstructed aggregated point clouds from RGB-D videos. We benchmark\nfour tasks with WildRGB-D including novel view synthesis, camera pose\nestimation, object 6D pose estimation, and object surface reconstruction. Our\nexperiments show that the large-scale capture of RGB-D objects provides great\npotential to advance 3D object learning. 
Our project page is\nhttps://wildrgbd.github.io/.\n","authors":["Hongchi Xia","Yang Fu","Sifei Liu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08276v2","updated":"2024-01-23T09:42:41Z","published":"2023-10-12T12:28:47Z","title":"Direction-Oriented Visual-semantic Embedding Model for Remote Sensing\n Image-text Retrieval","summary":" Image-text retrieval has developed rapidly in recent years. However, it is\nstill a challenge in remote sensing due to visual-semantic imbalance, which\nleads to incorrect matching of non-semantic visual and textual features. To\nsolve this problem, we propose a novel Direction-Oriented Visual-semantic\nEmbedding Model (DOVE) to mine the relationship between vision and language.\nOur highlight is to conduct visual and textual representations in latent space,\ndirecting them as close as possible to a redundancy-free regional visual\nrepresentation. Concretely, a Regional-Oriented Attention Module (ROAM)\nadaptively adjusts the distance between the final visual and textual embeddings\nin the latent semantic space, oriented by regional visual features. Meanwhile,\na lightweight Digging Text Genome Assistant (DTGA) is designed to expand the\nrange of tractable textual representation and enhance global word-level\nsemantic connections using less attention operations. Ultimately, we exploit a\nglobal visual-semantic constraint to reduce single visual dependency and serve\nas an external constraint for the final visual and textual representations. The\neffectiveness and superiority of our method are verified by extensive\nexperiments including parameter evaluation, quantitative comparison, ablation\nstudies and visual analysis, on two benchmark datasets, RSICD and RSITMD.\n","authors":["Qing Ma","Jiancheng Pan","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2310.08276v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.12587v1","updated":"2024-01-23T09:37:58Z","published":"2024-01-23T09:37:58Z","title":"Fast Implicit Neural Representation Image Codec in Resource-limited\n Devices","summary":" Displaying high-quality images on edge devices, such as augmented reality\ndevices, is essential for enhancing the user experience. However, these devices\noften face power consumption and computing resource limitations, making it\nchallenging to apply many deep learning-based image compression algorithms in\nthis field. Implicit Neural Representation (INR) for image compression is an\nemerging technology that offers two key benefits compared to cutting-edge\nautoencoder models: low computational complexity and parameter-free decoding.\nIt also outperforms many traditional and early neural compression methods in\nterms of quality. In this study, we introduce a new Mixed Autoregressive Model\n(MARM) to significantly reduce the decoding time for the current INR codec,\nalong with a new synthesis network to enhance reconstruction quality. MARM\nincludes our proposed Autoregressive Upsampler (ARU) blocks, which are highly\ncomputationally efficient, and ARM from previous work to balance decoding time\nand reconstruction quality. We also propose enhancing ARU's performance using a\ncheckerboard two-stage decoding strategy. Moreover, the ratio of different\nmodules can be adjusted to maintain a balance between quality and speed.\nComprehensive experiments demonstrate that our method significantly improves\ncomputational efficiency while preserving image quality. 
With different\nparameter settings, our method can outperform popular AE-based codecs in\nconstrained environments in terms of both quality and decoding time, or achieve\nstate-of-the-art reconstruction quality compared to other INR codecs.\n","authors":["Xiang Liu","Jiahong Chen","Bin Chen","Zimo Liu","Baoyi An","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06934v2","updated":"2024-01-23T09:23:40Z","published":"2023-12-12T02:10:16Z","title":"Toward Real Text Manipulation Detection: New Dataset and New Solution","summary":" With the surge in realistic text tampering, detecting fraudulent text in\nimages has gained prominence for maintaining information security. However, the\nhigh costs associated with professional text manipulation and annotation limit\nthe availability of real-world datasets, with most relying on synthetic\ntampering, which inadequately replicates real-world tampering attributes. To\naddress this issue, we present the Real Text Manipulation (RTM) dataset,\nencompassing 14,250 text images, which include 5,986 manually and 5,258\nautomatically tampered images, created using a variety of techniques, alongside\n3,006 unaltered text images for evaluating solution stability. Our evaluations\nindicate that existing methods falter in text forgery detection on the RTM\ndataset. We propose a robust baseline solution featuring a Consistency-aware\nAggregation Hub and a Gated Cross Neighborhood-attention Fusion module for\nefficient multi-modal information fusion, supplemented by a Tampered-Authentic\nContrastive Learning module during training, enriching feature representation\ndistinction. This framework, extendable to other dual-stream architectures,\ndemonstrated notable localization performance improvements of 7.33% and 6.38%\non manual and overall manipulations, respectively. Our contributions aim to\npropel advancements in real-world text tampering detection. Code and dataset\nwill be made available at https://github.com/DrLuo/RTM\n","authors":["Dongliang Luo","Yuliang Liu","Rui Yang","Xianjin Liu","Jishen Zeng","Yu Zhou","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2312.06934v2.pdf","comment":"The paper needs to be improved"},{"id":"http://arxiv.org/abs/2306.06075v2","updated":"2024-01-23T09:06:46Z","published":"2023-05-26T13:41:35Z","title":"DeepSeaNet: Improving Underwater Object Detection using EfficientDet","summary":" Marine animals and deep underwater objects are difficult to recognize and\nmonitor for safety of aquatic life. There is an increasing challenge when the\nwater is saline with granular particles and impurities. In such natural\nadversarial environment, traditional approaches like CNN start to fail and are\nexpensive to compute. This project involves implementing and evaluating various\nobject detection models, including EfficientDet, YOLOv5, YOLOv8, and\nDetectron2, on an existing annotated underwater dataset, called the\nBrackish-Dataset. The dataset comprises annotated image sequences of fish,\ncrabs, starfish, and other aquatic animals captured in Limfjorden water with\nlimited visibility. The aim of this research project is to study the efficiency\nof newer models on the same dataset and contrast them with the previous results\nbased on accuracy and inference time. Firstly, I compare the results of YOLOv3\n(31.10% mean Average Precision (mAP)), YOLOv4 (83.72% mAP), YOLOv5 (97.6%),\nYOLOv8 (98.20%), EfficientDet (98.56% mAP) and Detectron2 (95.20% mAP) on the\nsame dataset. 
Secondly, I provide a modified BiSkFPN mechanism (BiFPN neck with\nskip connections) to perform complex feature fusion in adversarial noise, which\nmakes the modified EfficientDet robust to perturbations. Third, I analyze the effect\nof adversarial learning on the accuracy of EfficientDet (98.63% mAP) and YOLOv5\n(98.04% mAP). Last, I provide class activation map (CAM) based explanations for\nthe two models to promote explainability in black-box models. Overall, the\nresults indicate that the modified EfficientDet achieves higher accuracy than the\nother models under five-fold cross-validation, with 88.54% IoU of feature\nmaps.\n","authors":["Sanyam Jain"],"pdf_url":"https://arxiv.org/pdf/2306.06075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15939v3","updated":"2024-01-23T08:54:59Z","published":"2023-11-27T15:46:47Z","title":"Unleashing the Power of Prompt-driven Nucleus Instance Segmentation","summary":" Nucleus instance segmentation in histology images is crucial for a broad\nspectrum of clinical applications. Current dominant algorithms rely on\nregression of nuclear proxy maps. Distinguishing nucleus instances from the\nestimated maps requires carefully curated post-processing, which is error-prone\nand parameter-sensitive. Recently, the Segment Anything Model (SAM) has earned\nhuge attention in medical image segmentation, owing to its impressive\ngeneralization ability and promptable property. Nevertheless, its potential for\nnucleus instance segmentation remains largely underexplored. In this paper, we\npresent a novel prompt-driven framework that consists of a nucleus prompter and\nSAM for automatic nucleus instance segmentation. Specifically, the prompter\nlearns to generate a unique point prompt for each nucleus while the SAM is\nfine-tuned to output the corresponding mask for the prompted nucleus.\nFurthermore, we propose the inclusion of adjacent nuclei as negative prompts to\nenhance the model's capability to identify overlapping nuclei. Without\ncomplicated post-processing, our proposed method sets a new state-of-the-art\nperformance on three challenging benchmarks. Code is available at\ngithub.com/windygoo/PromptNucSeg\n","authors":["Zhongyi Shui","Yunlong Zhang","Kai Yao","Chenglu Zhu","Sunyi Zheng","Jingxiong Li","Honglin Li","Yuxuan Sun","Ruizhe Guo","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15939v3.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2401.06827v2","updated":"2024-01-23T08:54:15Z","published":"2024-01-12T04:54:01Z","title":"APLe: Token-Wise Adaptive for Multi-Modal Prompt Learning","summary":" Pre-trained Vision-Language (V-L) models set the benchmark for generalization\nto downstream tasks among the noteworthy contenders. Many characteristics of\nthe V-L model have been explored in existing research including the challenge\nof sensitivity to text input and the tuning process across multi-modal\nprompts. With the advanced utilization of the V-L model like CLIP, recent\napproaches deploy learnable prompts instead of hand-crafted prompts to boost the\ngeneralization performance and address the aforementioned challenges. Inspired\nby layer-wise training, which is widely used in image fusion, we note that\nusing a sequential training process to adapt the different modality branches of\nCLIP efficiently facilitates the improvement of generalization. 
In the context\nof addressing the multi-modal prompting challenge, we propose Token-wise\nAdaptive for Multi-modal Prompt Learning (APLe) for tuning both modalities\nprompts, vision and language, as tokens in a sequential manner. APLe addresses\nthe challenges in V-L models to promote prompt learning across both modalities,\nwhich indicates a competitive generalization performance in line with the\nstate-of-the-art. Preeminently, APLe shows robustness and favourable\nperformance in prompt-length experiments with an absolute advantage in adopting\nthe V-L models.\n","authors":["Guiming Cao","Kaize Shi","Hong Fu","Huaiwen Zhang","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2401.06827v2.pdf","comment":"7 pages,3 figures"},{"id":"http://arxiv.org/abs/2401.12568v1","updated":"2024-01-23T08:54:10Z","published":"2024-01-23T08:54:10Z","title":"NeRF-AD: Neural Radiance Field with Attention-based Disentanglement for\n Talking Face Synthesis","summary":" Talking face synthesis driven by audio is one of the current research\nhotspots in the fields of multidimensional signal processing and multimedia.\nNeural Radiance Field (NeRF) has recently been brought to this research field\nin order to enhance the realism and 3D effect of the generated faces. However,\nmost existing NeRF-based methods either burden NeRF with complex learning tasks\nwhile lacking methods for supervised multimodal feature fusion, or cannot\nprecisely map audio to the facial region related to speech movements. These\nreasons ultimately result in existing methods generating inaccurate lip shapes.\nThis paper moves a portion of NeRF learning tasks ahead and proposes a talking\nface synthesis method via NeRF with attention-based disentanglement (NeRF-AD).\nIn particular, an Attention-based Disentanglement module is introduced to\ndisentangle the face into Audio-face and Identity-face using speech-related\nfacial action unit (AU) information. To precisely regulate how audio affects\nthe talking face, we only fuse the Audio-face with audio feature. In addition,\nAU information is also utilized to supervise the fusion of these two\nmodalities. Extensive qualitative and quantitative experiments demonstrate that\nour NeRF-AD outperforms state-of-the-art methods in generating realistic\ntalking face videos, including image quality and lip synchronization. To view\nvideo results, please refer to https://xiaoxingliu02.github.io/NeRF-AD.\n","authors":["Chongke Bi","Xiaoxing Liu","Zhilei Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12568v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12561v1","updated":"2024-01-23T08:44:26Z","published":"2024-01-23T08:44:26Z","title":"EndoGaussian: Gaussian Splatting for Deformable Surgical Scene\n Reconstruction","summary":" Reconstructing deformable tissues from endoscopic stereo videos is essential\nin many downstream surgical applications. However, existing methods suffer from\nslow inference speed, which greatly limits their practical use. In this paper,\nwe introduce EndoGaussian, a real-time surgical scene reconstruction framework\nthat builds on 3D Gaussian Splatting. Our framework represents dynamic surgical\nscenes as canonical Gaussians and a time-dependent deformation field, which\npredicts Gaussian deformations at novel timestamps. Due to the efficient\nGaussian representation and parallel rendering pipeline, our framework\nsignificantly accelerates the rendering speed compared to previous methods. 
In\naddition, we design the deformation field as the combination of a lightweight\nencoding voxel and an extremely tiny MLP, allowing for efficient Gaussian\ntracking with a minor rendering burden. Furthermore, we design a holistic\nGaussian initialization method to fully leverage the surface distribution\nprior, achieved by searching informative points from across the input image\nsequence. Experiments on public endoscope datasets demonstrate that our method\ncan achieve real-time rendering speed (195 FPS real-time, 100$\\times$ gain)\nwhile maintaining the state-of-the-art reconstruction quality (35.925 PSNR) and\nthe fastest training speed (within 2 min/scene), showing significant promise\nfor intraoperative surgery applications. Code is available at:\n\\url{https://yifliu3.github.io/EndoGaussian/}.\n","authors":["Yifan Liu","Chenxin Li","Chen Yang","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.12561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2003.13648v2","updated":"2024-01-23T08:40:55Z","published":"2020-03-30T17:32:49Z","title":"Weakly-supervised land classification for coastal zone based on deep\n convolutional neural networks by incorporating dual-polarimetric\n characteristics into training dataset","summary":" In this work we explore the performance of DCNNs on semantic segmentation\nusing spaceborne polarimetric synthetic aperture radar (PolSAR) datasets. The\nsemantic segmentation task using PolSAR data can be categorized as weakly\nsupervised learning when the characteristics of SAR data and data annotating\nprocedures are factored in. Datasets are initially analyzed for selecting\nfeasible pre-training images. Then the differences between spaceborne and\nairborne datasets are examined in terms of spatial resolution and viewing\ngeometry. In this study we used two dual-polarimetric images acquired by\nTerraSAR-X DLR. A novel method to produce training dataset with more supervised\ninformation is developed. Specifically, a series of typical classified images\nas well as intensity images serve as training datasets. A field survey is\nconducted for an area of about 20 square kilometers to obtain a ground truth\ndataset used for accuracy evaluation. Several transfer learning strategies are\nmade for aforementioned training datasets which will be combined in a\npracticable order. Three DCNN models, including SegNet, U-Net, and LinkNet, are\nimplemented next.\n","authors":["Sheng Sun","Armando Marino","Wenze Shui","Zhongwen Hu"],"pdf_url":"https://arxiv.org/pdf/2003.13648v2.pdf","comment":"We are sorry we would like to improve it"},{"id":"http://arxiv.org/abs/2312.03408v2","updated":"2024-01-23T08:36:17Z","published":"2023-12-06T10:46:53Z","title":"Open-sourced Data Ecosystem in Autonomous Driving: the Present and\n Future","summary":" With the continuous maturation and application of autonomous driving\ntechnology, a systematic examination of open-source autonomous driving datasets\nbecomes instrumental in fostering the robust evolution of the industry\necosystem. Current autonomous driving datasets can broadly be categorized into\ntwo generations. The first-generation autonomous driving datasets are\ncharacterized by relatively simpler sensor modalities, smaller data scale, and\nis limited to perception-level tasks. KITTI, introduced in 2012, serves as a\nprominent representative of this initial wave. 
In contrast, the\nsecond-generation datasets exhibit heightened complexity in sensor modalities,\ngreater data scale and diversity, and an expansion of tasks from perception to\nencompass prediction and control. Leading examples of the second generation\ninclude nuScenes and Waymo, introduced around 2019. This comprehensive review,\nconducted in collaboration with esteemed colleagues from both academia and\nindustry, systematically assesses over seventy open-source autonomous driving\ndatasets from domestic and international sources. It offers insights into\nvarious aspects, such as the principles underlying the creation of high-quality\ndatasets, the pivotal role of data engine systems, and the utilization of\ngenerative foundation models to facilitate scalable data generation.\nFurthermore, this review undertakes an exhaustive analysis and discourse\nregarding the characteristics and data scales that future third-generation\nautonomous driving datasets should possess. It also delves into the scientific\nand technical challenges that warrant resolution. These endeavors are pivotal\nin advancing autonomous innovation and fostering technological enhancement in\ncritical domains. For further details, please refer to\nhttps://github.com/OpenDriveLab/DriveAGI.\n","authors":["Hongyang Li","Yang Li","Huijie Wang","Jia Zeng","Pinlong Cai","Huilin Xu","Dahua Lin","Junchi Yan","Feng Xu","Lu Xiong","Jingdong Wang","Futang Zhu","Kai Yan","Chunjing Xu","Tiancai Wang","Beipeng Mu","Shaoqing Ren","Zhihui Peng","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2312.03408v2.pdf","comment":"This article is a simplified English translation of corresponding\n Chinese article. Please refer to Chinese version for the complete content"},{"id":"http://arxiv.org/abs/2310.07189v2","updated":"2024-01-23T08:20:05Z","published":"2023-10-11T04:38:21Z","title":"SpikePoint: An Efficient Point-based Spiking Neural Network for Event\n Cameras Action Recognition","summary":" Event cameras are bio-inspired sensors that respond to local changes in light\nintensity and feature low latency, high energy efficiency, and high dynamic\nrange. Meanwhile, Spiking Neural Networks (SNNs) have gained significant\nattention due to their remarkable efficiency and fault tolerance. By\nsynergistically harnessing the energy efficiency inherent in event cameras and\nthe spike-based processing capabilities of SNNs, their integration could enable\nultra-low-power application scenarios, such as action recognition tasks.\nHowever, existing approaches often entail converting asynchronous events into\nconventional frames, leading to additional data mapping efforts and a loss of\nsparsity, contradicting the design concept of SNNs and event cameras. To\naddress this challenge, we propose SpikePoint, a novel end-to-end point-based\nSNN architecture. SpikePoint excels at processing sparse event cloud data,\neffectively extracting both global and local features through a singular-stage\nstructure. Leveraging the surrogate training method, SpikePoint achieves high\naccuracy with few parameters and maintains low power consumption, specifically\nemploying the identity mapping feature extractor on diverse datasets.\nSpikePoint achieves state-of-the-art (SOTA) performance on four event-based\naction recognition datasets using only 16 timesteps, surpassing other SNN\nmethods. 
Moreover, it also achieves SOTA performance across all methods on\nthree datasets, utilizing approximately 0.3\\% of the parameters and 0.5\\% of\npower consumption employed by artificial neural networks (ANNs). These results\nemphasize the significance of Point Cloud and pave the way for many\nultra-low-power event-based data processing applications.\n","authors":["Hongwei Ren","Yue Zhou","Yulong Huang","Haotian Fu","Xiaopeng Lin","Jie Song","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.07189v2.pdf","comment":"Accepted by ICLR 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2305.13208v2","updated":"2024-01-23T08:18:27Z","published":"2023-05-16T06:19:03Z","title":"Iterative Adversarial Attack on Image-guided Story Ending Generation","summary":" Multimodal learning involves developing models that can integrate information\nfrom various sources like images and texts. In this field, multimodal text\ngeneration is a crucial aspect that involves processing data from multiple\nmodalities and outputting text. The image-guided story ending generation\n(IgSEG) is a particularly significant task, targeting on an understanding of\ncomplex relationships between text and image data with a complete story text\nending. Unfortunately, deep neural networks, which are the backbone of recent\nIgSEG models, are vulnerable to adversarial samples. Current adversarial attack\nmethods mainly focus on single-modality data and do not analyze adversarial\nattacks for multimodal text generation tasks that use cross-modal information.\nTo this end, we propose an iterative adversarial attack method\n(Iterative-attack) that fuses image and text modality attacks, allowing for an\nattack search for adversarial text and image in an more effective iterative\nway. Experimental results demonstrate that the proposed method outperforms\nexisting single-modal and non-iterative multimodal attack methods, indicating\nthe potential for improving the adversarial robustness of multimodal text\ngeneration models, such as multimodal machine translation, multimodal question\nanswering, etc.\n","authors":["Youze Wang","Wenbo Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.13208v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13014v4","updated":"2024-01-23T08:16:09Z","published":"2023-04-25T17:38:41Z","title":"Methods and datasets for segmentation of minimally invasive surgical\n instruments in endoscopic images and videos: A review of the state of the art","summary":" In the field of computer- and robot-assisted minimally invasive surgery,\nenormous progress has been made in recent years based on the recognition of\nsurgical instruments in endoscopic images and videos. In particular, the\ndetermination of the position and type of instruments is of great interest.\nCurrent work involves both spatial and temporal information, with the idea that\npredicting the movement of surgical tools over time may improve the quality of\nthe final segmentations. The provision of publicly available datasets has\nrecently encouraged the development of new methods, mainly based on deep\nlearning. In this review, we identify and characterize datasets used for method\ndevelopment and evaluation and quantify their frequency of use in the\nliterature. We further present an overview of the current state of research\nregarding the segmentation and tracking of minimally invasive surgical\ninstruments in endoscopic images and videos. 
The paper focuses on methods that\nwork purely visually, without markers of any kind attached to the instruments,\nconsidering both single-frame semantic and instance segmentation approaches, as\nwell as those that incorporate temporal information. The publications analyzed\nwere identified through the platforms Google Scholar, Web of Science, and\nPubMed. The search terms used were \"instrument segmentation\", \"instrument\ntracking\", \"surgical tool segmentation\", and \"surgical tool tracking\",\nresulting in a total of 741 articles published between 01/2015 and 07/2023, of\nwhich 123 were included using systematic selection criteria. A discussion of\nthe reviewed literature is provided, highlighting existing shortcomings and\nemphasizing the available potential for future developments.\n","authors":["Tobias Rueckert","Daniel Rueckert","Christoph Palm"],"pdf_url":"https://arxiv.org/pdf/2304.13014v4.pdf","comment":"30 pages, 10 figures"},{"id":"http://arxiv.org/abs/2110.11334v3","updated":"2024-01-23T07:36:33Z","published":"2021-10-21T17:59:41Z","title":"Generalized Out-of-Distribution Detection: A Survey","summary":" Out-of-distribution (OOD) detection is critical to ensuring the reliability\nand safety of machine learning systems. For instance, in autonomous driving, we\nwould like the driving system to issue an alert and hand over the control to\nhumans when it detects unusual scenes or objects that it has never seen during\ntraining time and cannot make a safe decision. The term, OOD detection, first\nemerged in 2017 and since then has received increasing attention from the\nresearch community, leading to a plethora of methods developed, ranging from\nclassification-based to density-based to distance-based ones. Meanwhile,\nseveral other problems, including anomaly detection (AD), novelty detection\n(ND), open set recognition (OSR), and outlier detection (OD), are closely\nrelated to OOD detection in terms of motivation and methodology. Despite common\ngoals, these topics develop in isolation, and their subtle differences in\ndefinition and problem setting often confuse readers and practitioners. In this\nsurvey, we first present a unified framework called generalized OOD detection,\nwhich encompasses the five aforementioned problems, i.e., AD, ND, OSR, OOD\ndetection, and OD. Under our framework, these five problems can be seen as\nspecial cases or sub-tasks, and are easier to distinguish. We then review each\nof these five areas by summarizing their recent technical developments, with a\nspecial focus on OOD detection methodologies. We conclude this survey with open\nchallenges and potential research directions.\n","authors":["Jingkang Yang","Kaiyang Zhou","Yixuan Li","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2110.11334v3.pdf","comment":"Feel free to comment on our Overleaf manuscript:\n https://www.overleaf.com/9899719915wmccvdtwpkct#c25192"},{"id":"http://arxiv.org/abs/2401.12535v1","updated":"2024-01-23T07:24:16Z","published":"2024-01-23T07:24:16Z","title":"Self-Supervised Vision Transformers Are Efficient Segmentation Learners\n for Imperfect Labels","summary":" This study demonstrates a cost-effective approach to semantic segmentation\nusing self-supervised vision transformers (SSVT). 
By freezing the SSVT backbone\nand training a lightweight segmentation head, our approach effectively utilizes\nimperfect labels, thereby improving robustness to label imperfections.\nEmpirical experiments show significant performance improvements over existing\nmethods for various annotation types, including scribble, point-level, and\nimage-level labels. The research highlights the effectiveness of\nself-supervised vision transformers in dealing with imperfect labels, providing\na practical and efficient solution for semantic segmentation while reducing\nannotation costs. Through extensive experiments, we confirm that our method\noutperforms baseline models for all types of imperfect labels. Especially under\nthe zero-shot vision-language-model-based label, our model exhibits 11.5\\%p\nperformance gain compared to the baseline.\n","authors":["Seungho Lee","Seoungyoon Kang","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2401.12535v1.pdf","comment":"AAAI2024 Edge Intelligence Workshop (EIW) accepted"},{"id":"http://arxiv.org/abs/2401.12513v1","updated":"2024-01-23T06:08:00Z","published":"2024-01-23T06:08:00Z","title":"Detecting and recognizing characters in Greek papyri with YOLOv8, DeiT\n and SimCLR","summary":" The capacity to isolate and recognize individual characters from facsimile\nimages of papyrus manuscripts yields rich opportunities for digital analysis.\nFor this reason the `ICDAR 2023 Competition on Detection and Recognition of\nGreek Letters on Papyri' was held as part of the 17th International Conference\non Document Analysis and Recognition. This paper discusses our submission to\nthe competition. We used an ensemble of YOLOv8 models to detect and classify\nindividual characters and employed two different approaches for refining the\ncharacter predictions, including a transformer based DeiT approach and a\nResNet-50 model trained on a large corpus of unlabelled data using SimCLR, a\nself-supervised learning method. Our submission won the recognition challenge\nwith a mAP of 42.2%, and was runner-up in the detection challenge with a mean\naverage precision (mAP) of 51.4%. At the more relaxed intersection over union\nthreshold of 0.5, we achieved the highest mean average precision and mean\naverage recall results for both detection and classification. We ran our\nprediction pipeline on more than 4,500 images from the Oxyrhynchus Papyri to\nillustrate the utility of our approach, and we release the results publicly in\nmultiple formats.\n","authors":["Robert Turnbull","Evelyn Mannix"],"pdf_url":"https://arxiv.org/pdf/2401.12513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03087v3","updated":"2024-01-23T06:07:45Z","published":"2022-02-07T11:55:23Z","title":"Unsupervised Long-Term Person Re-Identification with Clothes Change","summary":" We investigate unsupervised person re-identification (Re-ID) with clothes\nchange, a new challenging problem with more practical usability and scalability\nto real-world deployment. Most existing re-id methods artificially assume the\nclothes of every single person to be stationary across space and time. This\ncondition is mostly valid for short-term re-id scenarios since an average\nperson would often change the clothes even within a single day. To alleviate\nthis assumption, several recent works have introduced the clothes change facet\nto re-id, with a focus on supervised learning person identity discriminative\nrepresentation with invariance to clothes changes. 
Taking a step further\ntowards this long-term re-id direction, we further eliminate the requirement of\nperson identity labels, as they are significantly more expensive and more\ntedious to annotate in comparison to short-term person re-id datasets. Compared\nto conventional unsupervised short-term re-id, this new problem is drastically\nmore challenging as different people may have similar clothes whilst the same\nperson can wear multiple suites of clothes over different locations and times\nwith very distinct appearance. To overcome such obstacles, we introduce a novel\nCurriculum Person Clustering (CPC) method that can adaptively regulate the\nunsupervised clustering criterion according to the clustering confidence.\nExperiments on three long-term person re-id datasets show that our CPC\noutperforms SOTA unsupervised re-id methods and even closely matches the\nsupervised re-id models.\n","authors":["Mingkun Li","Shupeng Cheng","Peng Xu","Xiatian Zhu","Chun-Guang Li","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2202.03087v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12511v1","updated":"2024-01-23T06:03:16Z","published":"2024-01-23T06:03:16Z","title":"Convolutional Initialization for Data-Efficient Vision Transformers","summary":" Training vision transformer networks on small datasets poses challenges. In\ncontrast, convolutional neural networks (CNNs) can achieve state-of-the-art\nperformance by leveraging their architectural inductive bias. In this paper, we\ninvestigate whether this inductive bias can be reinterpreted as an\ninitialization bias within a vision transformer network. Our approach is\nmotivated by the finding that random impulse filters can achieve almost\ncomparable performance to learned filters in CNNs. We introduce a novel\ninitialization strategy for transformer networks that can achieve comparable\nperformance to CNNs on small datasets while preserving its architectural\nflexibility.\n","authors":["Jianqiao Zheng","Xueqian Li","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2401.12511v1.pdf","comment":"14 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2401.12507v1","updated":"2024-01-23T05:57:50Z","published":"2024-01-23T05:57:50Z","title":"Open-Set Facial Expression Recognition","summary":" Facial expression recognition (FER) models are typically trained on datasets\nwith a fixed number of seven basic classes. However, recent research works\npoint out that there are far more expressions than the basic ones. Thus, when\nthese models are deployed in the real world, they may encounter unknown\nclasses, such as compound expressions that cannot be classified into existing\nbasic classes. To address this issue, we propose the open-set FER task for the\nfirst time. Though there are many existing open-set recognition methods, we\nargue that they do not work well for open-set FER because FER data are all\nhuman faces with very small inter-class distances, which makes the open-set\nsamples very similar to close-set samples. In this paper, we are the first to\ntransform the disadvantage of small inter-class distance into an advantage by\nproposing a new way for open-set FER. Specifically, we find that small\ninter-class distance allows for sparsely distributed pseudo labels of open-set\nsamples, which can be viewed as symmetric noisy labels. Based on this novel\nobservation, we convert the open-set FER to a noisy label detection problem. 
We\nfurther propose a novel method that incorporates attention map consistency and\ncycle training to detect the open-set samples. Extensive experiments on various\nFER datasets demonstrate that our method clearly outperforms state-of-the-art\nopen-set recognition methods by large margins. Code is available at\nhttps://github.com/zyh-uaiaaaa.\n","authors":["Yuhang Zhang","Yue Yao","Xuannan Liu","Lixiong Qin","Wenjing Wang","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2401.12507v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.03179v2","updated":"2024-01-23T05:57:30Z","published":"2024-01-06T09:53:33Z","title":"Multimodal Informative ViT: Information Aggregation and Distribution for\n Hyperspectral and LiDAR Classification","summary":" In multimodal land cover classification (MLCC), a common challenge is the\nredundancy in data distribution, where irrelevant information from multiple\nmodalities can hinder the effective integration of their unique features. To\ntackle this, we introduce the Multimodal Informative Vit (MIVit), a system with\nan innovative information aggregate-distributing mechanism. This approach\nredefines redundancy levels and integrates performance-aware elements into the\nfused representation, facilitating the learning of semantics in both forward\nand backward directions. MIVit stands out by significantly reducing redundancy\nin the empirical distribution of each modality's separate and fused features.\nIt employs oriented attention fusion (OAF) for extracting shallow local\nfeatures across modalities in horizontal and vertical dimensions, and a\nTransformer feature extractor for extracting deep global features through\nlong-range attention. We also propose an information aggregation constraint\n(IAC) based on mutual information, designed to remove redundant information and\npreserve complementary information within embedded features. Additionally, the\ninformation distribution flow (IDF) in MIVit enhances performance-awareness by\ndistributing global classification information across different modalities'\nfeature maps. This architecture also addresses missing modality challenges with\nlightweight independent modality classifiers, reducing the computational load\ntypically associated with Transformers. Our results show that MIVit's\nbidirectional aggregate-distributing mechanism between modalities is highly\neffective, achieving an average overall accuracy of 95.56% across three\nmultimodal datasets. This performance surpasses current state-of-the-art\nmethods in MLCC. The code for MIVit is accessible at\nhttps://github.com/icey-zhang/MIViT.\n","authors":["Jiaqing Zhang","Jie Lei","Weiying Xie","Geng Yang","Daixun Li","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2401.03179v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12503v1","updated":"2024-01-23T05:55:26Z","published":"2024-01-23T05:55:26Z","title":"Small Language Model Meets with Reinforced Vision Vocabulary","summary":" Playing Large Vision Language Models (LVLMs) in 2023 is trendy among the AI\ncommunity. However, the relatively large number of parameters (more than 7B) of\npopular LVLMs makes it difficult to train and deploy on consumer GPUs,\ndiscouraging many researchers with limited resources. Imagine how cool it would\nbe to experience all the features of current LVLMs on an old GTX1080ti (our\nonly game card). Accordingly, we present Vary-toy in this report, a small-size\nVary along with Qwen-1.8B as the base ``large'' language model. 
In Vary-toy, we\nintroduce an improved vision vocabulary, allowing the model to not only possess\nall features of Vary but also gather more generality. Specifically, we replace\nnegative samples of natural images with positive sample data driven by object\ndetection in the procedure of generating vision vocabulary, more sufficiently\nutilizing the capacity of the vocabulary network and enabling it to efficiently\nencode visual information corresponding to natural objects. For experiments,\nVary-toy can achieve 65.6% ANLS on DocVQA, 59.1% accuracy on ChartQA, 88.1%\naccuracy on RefCOCO, and 29% on MMVet. The code will be publicly available on\nthe homepage.\n","authors":["Haoran Wei","Lingyu Kong","Jinyue Chen","Liang Zhao","Zheng Ge","En Yu","Jianjian Sun","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.12503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00334v3","updated":"2024-01-23T05:38:56Z","published":"2023-12-30T21:48:20Z","title":"Explainability-Driven Leaf Disease Classification Using Adversarial\n Training and Knowledge Distillation","summary":" This work focuses on plant leaf disease classification and explores three\ncrucial aspects: adversarial training, model explainability, and model\ncompression. The models' robustness against adversarial attacks is enhanced\nthrough adversarial training, ensuring accurate classification even in the\npresence of threats. Leveraging explainability techniques, we gain insights\ninto the model's decision-making process, improving trust and transparency.\nAdditionally, we explore model compression techniques to optimize computational\nefficiency while maintaining classification performance. Through our\nexperiments, we determine that on a benchmark dataset, the robustness can be\nthe price of the classification accuracy with performance reductions of 3%-20%\nfor regular tests and gains of 50%-70% for adversarial attack tests. We also\ndemonstrate that a student model can be 15-25 times more computationally\nefficient for a slight performance reduction, distilling the knowledge of more\ncomplex models.\n","authors":["Sebastian-Vasile Echim","Iulian-Marius Tăiatu","Dumitru-Clementin Cercel","Florin Pop"],"pdf_url":"https://arxiv.org/pdf/2401.00334v3.pdf","comment":"10 pages, 8 figures, Accepted by ICAART 2024"},{"id":"http://arxiv.org/abs/2304.06470v5","updated":"2024-01-23T05:03:53Z","published":"2023-03-29T15:26:44Z","title":"Qualitative Failures of Image Generation Models and Their Application in\n Detecting Deepfakes","summary":" The ability of image and video generation models to create photorealistic\nimages has reached unprecedented heights, making it difficult to distinguish\nbetween real and fake images in many cases. However, despite this progress, a\ngap remains between the quality of generated images and those found in the real\nworld. To address this, we have reviewed a vast body of literature from both\nacademic publications and social media to identify qualitative shortcomings in\nimage generation models, which we have classified into five categories. By\nunderstanding these failures, we can identify areas where these models need\nimprovement, as well as develop strategies for detecting deep fakes. 
The\nprevalence of deep fakes in today's society is a serious concern, and our\nfindings can help mitigate their negative impact.\n","authors":["Ali Borji"],"pdf_url":"https://arxiv.org/pdf/2304.06470v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12488v1","updated":"2024-01-23T05:00:02Z","published":"2024-01-23T05:00:02Z","title":"An Automated Real-Time Approach for Image Processing and Segmentation of\n Fluoroscopic Images and Videos Using a Single Deep Learning Network","summary":" Image segmentation in total knee arthroplasty is crucial for precise\npreoperative planning and accurate implant positioning, leading to improved\nsurgical outcomes and patient satisfaction. The biggest challenges of image\nsegmentation in total knee arthroplasty include accurately delineating complex\nanatomical structures, dealing with image artifacts and noise, and developing\nrobust algorithms that can handle anatomical variations and pathologies\ncommonly encountered in patients. The potential of using machine learning for\nimage segmentation in total knee arthroplasty lies in its ability to improve\nsegmentation accuracy, automate the process, and provide real-time assistance\nto surgeons, leading to enhanced surgical planning, implant placement, and\npatient outcomes. This paper proposes a methodology to use deep learning for\nrobust and real-time total knee arthroplasty image segmentation. The deep\nlearning model, trained on a large dataset, demonstrates outstanding\nperformance in accurately segmenting both the implanted femur and tibia,\nachieving an impressive mean-Average-Precision (mAP) of 88.83 when compared to\nthe ground truth while also achieving a real-time segmented speed of 20 frames\nper second (fps). We have introduced a novel methodology for segmenting\nimplanted knee fluoroscopic or x-ray images that showcases remarkable levels of\naccuracy and speed, paving the way for various potential extended applications.\n","authors":["Viet Dung Nguyen","Michael T. LaCour","Richard D. Komistek"],"pdf_url":"https://arxiv.org/pdf/2401.12488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11115v2","updated":"2024-01-23T04:41:12Z","published":"2024-01-20T04:58:06Z","title":"MotionMix: Weakly-Supervised Diffusion for Controllable Motion\n Generation","summary":" Controllable generation of 3D human motions becomes an important topic as the\nworld embraces digital transformation. Existing works, though making promising\nprogress with the advent of diffusion models, heavily rely on meticulously\ncaptured and annotated (e.g., text) high-quality motion corpus, a\nresource-intensive endeavor in the real world. This motivates our proposed\nMotionMix, a simple yet effective weakly-supervised diffusion model that\nleverages both noisy and unannotated motion sequences. Specifically, we\nseparate the denoising objectives of a diffusion model into two stages:\nobtaining conditional rough motion approximations in the initial $T-T^*$ steps\nby learning the noisy annotated motions, followed by the unconditional\nrefinement of these preliminary motions during the last $T^*$ steps using\nunannotated motions. Notably, though learning from two sources of imperfect\ndata, our model does not compromise motion generation quality compared to fully\nsupervised approaches that access gold data. 
Extensive experiments on several\nbenchmarks demonstrate that our MotionMix, as a versatile framework,\nconsistently achieves state-of-the-art performances on text-to-motion,\naction-to-motion, and music-to-dance tasks.\n","authors":["Nhat M. Hoang","Kehong Gong","Chuan Guo","Michael Bi Mi"],"pdf_url":"https://arxiv.org/pdf/2401.11115v2.pdf","comment":"Accepted at the 38th Association for the Advancement of Artificial\n Intelligence (AAAI) Conference on Artificial Intelligence, Main Conference"},{"id":"http://arxiv.org/abs/2401.12480v1","updated":"2024-01-23T04:19:15Z","published":"2024-01-23T04:19:15Z","title":"Explore Synergistic Interaction Across Frames for Interactive Video\n Object Segmentation","summary":" Interactive Video Object Segmentation (iVOS) is a challenging task that\nrequires real-time human-computer interaction. To improve the user experience,\nit is important to consider the user's input habits, segmentation quality,\nrunning time and memory consumption.However, existing methods compromise user\nexperience with single input mode and slow running speed. Specifically, these\nmethods only allow the user to interact with one single frame, which limits the\nexpression of the user's intent.To overcome these limitations and better align\nwith people's usage habits, we propose a framework that can accept multiple\nframes simultaneously and explore synergistic interaction across frames (SIAF).\nConcretely, we designed the Across-Frame Interaction Module that enables users\nto annotate different objects freely on multiple frames. The AFI module will\nmigrate scribble information among multiple interactive frames and generate\nmulti-frame masks. Additionally, we employ the id-queried mechanism to process\nmultiple objects in batches. Furthermore, for a more efficient propagation and\nlightweight model, we design a truncated re-propagation strategy to replace the\nprevious multi-round fusion module, which employs an across-round memory that\nstores important interaction information. Our SwinB-SIAF achieves new\nstate-of-the-art performance on DAVIS 2017 (89.6%, J&F@60). Moreover, our\nR50-SIAF is more than 3 faster than the state-of-the-art competitor under\nchallenging multi-object scenarios.\n","authors":["Kexin Li","Tao Jiang","Zongxin Yang","Yi Yang","Yueting Zhuang","Jun Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.12480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12479v1","updated":"2024-01-23T04:17:42Z","published":"2024-01-23T04:17:42Z","title":"TD^2-Net: Toward Denoising and Debiasing for Dynamic Scene Graph\n Generation","summary":" Dynamic scene graph generation (SGG) focuses on detecting objects in a video\nand determining their pairwise relationships. Existing dynamic SGG methods\nusually suffer from several issues, including 1) Contextual noise, as some\nframes might contain occluded and blurred objects. 2) Label bias, primarily due\nto the high imbalance between a few positive relationship samples and numerous\nnegative ones. Additionally, the distribution of relationships exhibits a\nlong-tailed pattern. To address the above problems, in this paper, we introduce\na network named TD$^2$-Net that aims at denoising and debiasing for dynamic\nSGG. 
Specifically, we first propose a denoising spatio-temporal transformer\nmodule that enhances object representation with robust contextual information.\nThis is achieved by designing a differentiable Top-K object selector that\nutilizes the gumbel-softmax sampling strategy to select the relevant\nneighborhood for each object. Second, we introduce an asymmetrical reweighting\nloss to relieve the issue of label bias. This loss function integrates\nasymmetry focusing factors and the volume of samples to adjust the weights\nassigned to individual samples. Systematic experimental results demonstrate the\nsuperiority of our proposed TD$^2$-Net over existing state-of-the-art\napproaches on Action Genome databases. In more detail, TD$^2$-Net outperforms\nthe second-best competitors by 12.7 \\% on mean-Recall@10 for predicate\nclassification.\n","authors":["Xin Lin","Chong Shi","Yibing Zhan","Zuopeng Yang","Yaqi Wu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.12479v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2305.14800v6","updated":"2024-01-23T04:01:43Z","published":"2023-05-24T06:52:47Z","title":"Exploring Diverse In-Context Configurations for Image Captioning","summary":" After discovering that Language Models (LMs) can be good in-context few-shot\nlearners, numerous strategies have been proposed to optimize in-context\nsequence configurations. Recently, researchers in Vision-Language (VL) domains\nalso develop their few-shot learners, while they only use the simplest way,\nie., randomly sampling, to configure in-context image-text pairs. In order to\nexplore the effects of varying configurations on VL in-context learning, we\ndevised four strategies for image selection and four for caption assignment to\nconfigure in-context image-text pairs for image captioning. Here Image\nCaptioning is used as the case study since it can be seen as the\nvisually-conditioned LM. Our comprehensive experiments yield two\ncounter-intuitive but valuable insights, highlighting the distinct\ncharacteristics of VL in-context learning due to multi-modal synergy, as\ncompared to the NLP case. Furthermore, in our exploration of optimal\ncombination strategies, we observed an average performance enhancement of 20.9\nof CIDEr scores compared to the baseline. The code is given in\nhttps://github.com/yongliang-wu/ExploreCfg.\n","authors":["Xu Yang","Yongliang Wu","Mingzhuo Yang","Haokun Chen","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2305.14800v6.pdf","comment":"Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2301.11915v2","updated":"2024-01-23T04:00:25Z","published":"2023-01-27T18:58:42Z","title":"Understanding Self-Supervised Pretraining with Part-Aware Representation\n Learning","summary":" In this paper, we are interested in understanding self-supervised pretraining\nthrough studying the capability that self-supervised representation pretraining\nmethods learn part-aware representations. The study is mainly motivated by that\nrandom views, used in contrastive learning, and random masked (visible)\npatches, used in masked image modeling, are often about object parts.\n We explain that contrastive learning is a part-to-whole task: the projection\nlayer hallucinates the whole object representation from the object part\nrepresentation learned from the encoder, and that masked image modeling is a\npart-to-part task: the masked patches of the object are hallucinated from the\nvisible patches. 
The explanation suggests that the self-supervised pretrained\nencoder is required to understand the object part. We empirically compare the\noff-the-shelf encoders pretrained with several representative methods on\nobject-level recognition and part-level recognition. The results show that the\nfully-supervised model outperforms self-supervised models for object-level\nrecognition, and most self-supervised contrastive learning and masked image\nmodeling methods outperform the fully-supervised method for part-level\nrecognition. It is observed that the combination of contrastive learning and\nmasked image modeling further improves the performance.\n","authors":["Jie Zhu","Jiyang Qi","Mingyu Ding","Xiaokang Chen","Ping Luo","Xinggang Wang","Wenyu Liu","Leye Wang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2301.11915v2.pdf","comment":"Accepted by TMLR"},{"id":"http://arxiv.org/abs/2203.13883v4","updated":"2024-01-23T03:54:48Z","published":"2022-03-25T19:45:33Z","title":"Multi-modal Misinformation Detection: Approaches, Challenges and\n Opportunities","summary":" As social media platforms are evolving from text-based forums into\nmulti-modal environments, the nature of misinformation in social media is also\ntransforming accordingly. Taking advantage of the fact that visual modalities\nsuch as images and videos are more favorable and attractive to the users and\ntextual contents are sometimes skimmed carelessly, misinformation spreaders\nhave recently targeted contextual connections between the modalities e.g., text\nand image. Hence many researchers have developed automatic techniques for\ndetecting possible cross-modal discordance in web-based content. We analyze,\ncategorize and identify existing approaches in addition to challenges and\nshortcomings they face in order to unearth new research opportunities in the\nfield of multi-modal misinformation detection.\n","authors":["Sara Abdali"],"pdf_url":"https://arxiv.org/pdf/2203.13883v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12471v1","updated":"2024-01-23T03:45:05Z","published":"2024-01-23T03:45:05Z","title":"Zero Shot Open-ended Video Inference","summary":" Zero-shot open-ended inference on untrimmed videos poses a significant\nchallenge, especially when no annotated data is utilized to navigate the\ninference direction. In this work, we aim to address this underexplored domain\nby introducing an adaptable framework that efficiently combines both the frozen\nvision-language (VL) model and off-the-shelf large language model (LLM) for\nconducting zero-shot open-ended inference tasks without requiring any\nadditional training or fine-tuning. Our comprehensive experiments span various\nvideo action datasets for goal inference and action recognition tasks. The\nresults demonstrate the framework's superior performance in goal inference\ncompared to conventional vision-language models in open-ended and close-ended\nscenarios. 
Notably, the proposed framework exhibits the capability to\ngeneralize effectively to action recognition tasks, underscoring its\nversatility and potential contributions to advancing the video-based zero-shot\nunderstanding.\n","authors":["Ee Yeo Keat","Zhang Hao","Alexander Matyasko","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.12471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14451v2","updated":"2024-01-23T03:41:44Z","published":"2023-06-26T06:45:16Z","title":"Learning Prompt-Enhanced Context Features for Weakly-Supervised Video\n Anomaly Detection","summary":" Video anomaly detection under weak supervision presents significant\nchallenges, particularly due to the lack of frame-level annotations during\ntraining. While prior research has utilized graph convolution networks and\nself-attention mechanisms alongside multiple instance learning (MIL)-based\nclassification loss to model temporal relations and learn discriminative\nfeatures, these methods often employ multi-branch architectures to capture\nlocal and global dependencies separately, resulting in increased parameters and\ncomputational costs. Moreover, the coarse-grained interclass separability\nprovided by the binary constraint of MIL-based loss neglects the fine-grained\ndiscriminability within anomalous classes. In response, this paper introduces a\nweakly supervised anomaly detection framework that focuses on efficient context\nmodeling and enhanced semantic discriminability. We present a Temporal Context\nAggregation (TCA) module that captures comprehensive contextual information by\nreusing the similarity matrix and implementing adaptive fusion. Additionally,\nwe propose a Prompt-Enhanced Learning (PEL) module that integrates semantic\npriors using knowledge-based prompts to boost the discriminative capacity of\ncontext features while ensuring separability between anomaly sub-classes.\nExtensive experiments validate the effectiveness of our method's components,\ndemonstrating competitive performance with reduced parameters and computational\neffort on three challenging benchmarks: UCF-Crime, XD-Violence, and\nShanghaiTech datasets. Notably, our approach significantly improves the\ndetection accuracy of certain anomaly sub-classes, underscoring its practical\nvalue and efficacy. Our code is available at:\nhttps://github.com/yujiangpu20/PEL4VAD.\n","authors":["Yujiang Pu","Xiaoyu Wu","Lulu Yang","Shengjin Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14451v2.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.12001v2","updated":"2024-01-23T03:19:12Z","published":"2024-01-22T14:52:08Z","title":"Modeling Stereo-Confidence Out of the End-to-End Stereo-Matching Network\n via Disparity Plane Sweep","summary":" We propose a novel stereo-confidence that can be measured externally to\nvarious stereo-matching networks, offering an alternative input modality choice\nof the cost volume for learning-based approaches, especially in safety-critical\nsystems. Grounded in the foundational concepts of disparity definition and the\ndisparity plane sweep, the proposed stereo-confidence method is built upon the\nidea that any shift in a stereo-image pair should be updated in a corresponding\namount shift in the disparity map. Based on this idea, the proposed\nstereo-confidence method can be summarized in three folds. 
1) Using the\ndisparity plane sweep, multiple disparity maps can be obtained and treated as a\n3-D volume (predicted disparity volume), like the cost volume is constructed.\n2) One of these disparity maps serves as an anchor, allowing us to define a\ndesirable (or ideal) disparity profile at every spatial point. 3) By comparing\nthe desirable and predicted disparity profiles, we can quantify the level of\nmatching ambiguity between left and right images for confidence measurement.\nExtensive experimental results using various stereo-matching networks and\ndatasets demonstrate that the proposed stereo-confidence method not only shows\ncompetitive performance on its own but also consistent performance improvements\nwhen it is used as an input modality for learning-based stereo-confidence\nmethods.\n","authors":["Jae Young Lee","Woonghyun Ka","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12001v2.pdf","comment":"AAAI 2024. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2401.12019v2","updated":"2024-01-23T03:16:43Z","published":"2024-01-22T15:05:05Z","title":"Stereo-Matching Knowledge Distilled Monocular Depth Estimation Filtered\n by Multiple Disparity Consistency","summary":" In stereo-matching knowledge distillation methods of the self-supervised\nmonocular depth estimation, the stereo-matching network's knowledge is\ndistilled into a monocular depth network through pseudo-depth maps. In these\nmethods, the learning-based stereo-confidence network is generally utilized to\nidentify errors in the pseudo-depth maps to prevent transferring the errors.\nHowever, the learning-based stereo-confidence networks should be trained with\nground truth (GT), which is not feasible in a self-supervised setting. In this\npaper, we propose a method to identify and filter errors in the pseudo-depth\nmap using multiple disparity maps by checking their consistency without the\nneed for GT and a training process. Experimental results show that the proposed\nmethod outperforms the previous methods and works well on various\nconfigurations by filtering out erroneous areas where the stereo-matching is\nvulnerable, especially such as textureless regions, occlusion boundaries, and\nreflective surfaces.\n","authors":["Woonghyun Ka","Jae Young Lee","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12019v2.pdf","comment":"ICASSP 2024. The first two authors are equally contributed"},{"id":"http://arxiv.org/abs/2401.09495v4","updated":"2024-01-23T03:09:53Z","published":"2024-01-17T01:33:40Z","title":"IPR-NeRF: Ownership Verification meets Neural Radiance Field","summary":" Neural Radiance Field (NeRF) models have gained significant attention in the\ncomputer vision community in the recent past with state-of-the-art visual\nquality and produced impressive demonstrations. Since then, technopreneurs have\nsought to leverage NeRF models into a profitable business. Therefore, NeRF\nmodels make it worth the risk of plagiarizers illegally copying,\nre-distributing, or misusing those models. This paper proposes a comprehensive\nintellectual property (IP) protection framework for the NeRF model in both\nblack-box and white-box settings, namely IPR-NeRF. In the black-box setting, a\ndiffusion-based solution is introduced to embed and extract the watermark via a\ntwo-stage optimization process. In the white-box setting, a designated digital\nsignature is embedded into the weights of the NeRF model by adopting the sign\nloss objective. 
Our extensive experiments demonstrate that not only does our\napproach maintain the fidelity (\\ie, the rendering quality) of IPR-NeRF models,\nbut it is also robust against both ambiguity and removal attacks compared to\nprior arts.\n","authors":["Win Kent Ong","Kam Woh Ng","Chee Seng Chan","Yi Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2401.09495v4.pdf","comment":"Error on result tabulation of state of the art method which might\n cause misleading to readers"},{"id":"http://arxiv.org/abs/2309.17105v4","updated":"2024-01-23T02:59:35Z","published":"2023-09-29T10:06:28Z","title":"Continual Action Assessment via Task-Consistent Score-Discriminative\n Feature Distribution Modeling","summary":" Action Quality Assessment (AQA) is a task that tries to answer how well an\naction is carried out. While remarkable progress has been achieved, existing\nworks on AQA assume that all the training data are visible for training in one\ntime, but do not enable continual learning on assessing new technical actions.\nIn this work, we address such a Continual Learning problem in AQA\n(Continual-AQA), which urges a unified model to learn AQA tasks sequentially\nwithout forgetting. Our idea for modeling Continual-AQA is to sequentially\nlearn a task-consistent score-discriminative feature distribution, in which the\nlatent features express a strong correlation with the score labels regardless\nof the task or action types. From this perspective, we aim to mitigate the\nforgetting in Continual-AQA from two aspects. Firstly, to fuse the features of\nnew and previous data into a score-discriminative distribution, a novel\nFeature-Score Correlation-Aware Rehearsal is proposed to store and reuse data\nfrom previous tasks with limited memory size. Secondly, an Action\nGeneral-Specific Graph is developed to learn and decouple the action-general\nand action-specific knowledge so that the task-consistent score-discriminative\nfeatures can be better extracted across various tasks. Extensive experiments\nare conducted to evaluate the contributions of proposed components. The\ncomparisons with the existing continual learning methods additionally verify\nthe effectiveness and versatility of our approach.\n","authors":["Yuan-Ming Li","Ling-An Zeng","Jing-Ke Meng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.17105v4.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.01520v2","updated":"2024-01-23T02:59:04Z","published":"2024-01-03T03:08:32Z","title":"S$^{2}$-DMs:Skip-Step Diffusion Models","summary":" Diffusion models have emerged as powerful generative tools, rivaling GANs in\nsample quality and mirroring the likelihood scores of autoregressive models. A\nsubset of these models, exemplified by DDIMs, exhibit an inherent asymmetry:\nthey are trained over $T$ steps but only sample from a subset of $T$ during\ngeneration. This selective sampling approach, though optimized for speed,\ninadvertently misses out on vital information from the unsampled steps, leading\nto potential compromises in sample quality. To address this issue, we present\nthe S$^{2}$-DMs, which is a new training method by using an innovative\n$L_{skip}$, meticulously designed to reintegrate the information omitted during\nthe selective sampling phase. The benefits of this approach are manifold: it\nnotably enhances sample quality, is exceptionally simple to implement, requires\nminimal code modifications, and is flexible enough to be compatible with\nvarious sampling algorithms. 
On the CIFAR10 dataset, models trained using our\nalgorithm showed an improvement of 3.27% to 14.06% over models trained with\ntraditional methods across various sampling algorithms (DDIMs, PNDMs, DEIS) and\ndifferent numbers of sampling steps (10, 20, ..., 1000). On the CELEBA dataset,\nthe improvement ranged from 8.97% to 27.08%. Access to the code and additional\nresources is provided in the github.\n","authors":["Yixuan Wang","Shuangyin Li"],"pdf_url":"https://arxiv.org/pdf/2401.01520v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2401.12456v1","updated":"2024-01-23T02:53:06Z","published":"2024-01-23T02:53:06Z","title":"Exploration and Improvement of Nerf-based 3D Scene Editing Techniques","summary":" NeRF's high-quality scene synthesis capability was quickly accepted by\nscholars in the years after it was proposed, and significant progress has been\nmade in 3D scene representation and synthesis. However, the high computational\ncost limits intuitive and efficient editing of scenes, making NeRF's\ndevelopment in the scene editing field facing many challenges. This paper\nreviews the preliminary explorations of scholars on NeRF in the scene or object\nediting field in recent years, mainly changing the shape and texture of scenes\nor objects in new synthesized scenes; through the combination of residual\nmodels such as GaN and Transformer with NeRF, the generalization ability of\nNeRF scene editing has been further expanded, including realizing real-time new\nperspective editing feedback, multimodal editing of text synthesized 3D scenes,\n4D synthesis performance, and in-depth exploration in light and shadow editing,\ninitially achieving optimization of indirect touch editing and detail\nrepresentation in complex scenes. Currently, most NeRF editing methods focus on\nthe touch points and materials of indirect points, but when dealing with more\ncomplex or larger 3D scenes, it is difficult to balance accuracy, breadth,\nefficiency, and quality. Overcoming these challenges may become the direction\nof future NeRF 3D scene editing technology.\n","authors":["Shun Fang","Ming Cui","Xing Feng","Yanan Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.12456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12452v1","updated":"2024-01-23T02:41:06Z","published":"2024-01-23T02:41:06Z","title":"Self-supervised Learning of LiDAR 3D Point Clouds via 2D-3D Neural\n Calibration","summary":" This paper introduces a novel self-supervised learning framework for\nenhancing 3D perception in autonomous driving scenes. Specifically, our\napproach, named NCLR, focuses on 2D-3D neural calibration, a novel pretext task\nthat estimates the rigid transformation aligning camera and LiDAR coordinate\nsystems. First, we propose the learnable transformation alignment to bridge the\ndomain gap between image and point cloud data, converting features into a\nunified representation space for effective comparison and matching. Second, we\nidentify the overlapping area between the image and point cloud with the fused\nfeatures. Third, we establish dense 2D-3D correspondences to estimate the rigid\ntransformation. The framework not only learns fine-grained matching from points\nto pixels but also achieves alignment of the image and point cloud at a\nholistic level, understanding their relative pose. We demonstrate NCLR's\nefficacy by applying the pre-trained backbone to downstream tasks, such as\nLiDAR-based 3D semantic segmentation, object detection, and panoptic\nsegmentation. 
Comprehensive experiments on various datasets illustrate the\nsuperiority of NCLR over existing self-supervised methods. The results confirm\nthat joint learning from different modalities significantly enhances the\nnetwork's understanding abilities and effectiveness of learned representation.\nCode will be available at \\url{https://github.com/Eaphan/NCLR}.\n","authors":["Yifan Zhang","Siyu Ren","Junhui Hou","Jinjian Wu","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.12452v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2401.12451v1","updated":"2024-01-23T02:30:16Z","published":"2024-01-23T02:30:16Z","title":"Methods and strategies for improving the novel view synthesis quality of\n neural radiation field","summary":" Neural Radiation Field (NeRF) technology can learn a 3D implicit model of a\nscene from 2D images and synthesize realistic novel view images. This\ntechnology has received widespread attention from the industry and has good\napplication prospects. In response to the problem that the rendering quality of\nNeRF images needs to be improved, many researchers have proposed various\nmethods to improve the rendering quality in the past three years. The latest\nrelevant papers are classified and reviewed, the technical principles behind\nquality improvement are analyzed, and the future evolution direction of quality\nimprovement methods is discussed. This study can help researchers quickly\nunderstand the current state and evolutionary context of technology in this\nfield, which is helpful in inspiring the development of more efficient\nalgorithms and promoting the application of NeRF technology in related fields.\n","authors":["Shun Fang","Ming Cui","Xing Feng","Yanna Lv"],"pdf_url":"https://arxiv.org/pdf/2401.12451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02317v3","updated":"2024-01-23T02:29:35Z","published":"2023-05-03T17:58:29Z","title":"Visual Chain of Thought: Bridging Logical Gaps with Multimodal\n Infillings","summary":" Recent advances in large language models elicit reasoning in a\nchain-of-thought that allows models to decompose problems in a human-like\nfashion. Though this paradigm improves multi-step reasoning ability in language\nmodels, it is limited by being unimodal and applied mainly to\nquestion-answering tasks. We claim that incorporating visual augmentation into\nreasoning is essential, especially for complex, imaginative tasks.\nConsequently, we introduce VCoT, a novel method that leverages chain-of-thought\nprompting with vision-language grounding to recursively bridge the logical gaps\nwithin sequential data. Our method uses visual guidance to generate synthetic\nmultimodal infillings that add consistent and novel information to reduce the\nlogical gaps for downstream tasks that can benefit from temporal reasoning, as\nwell as provide interpretability into models' multi-step reasoning. 
We apply\nVCoT to the Visual Storytelling and WikiHow summarization datasets and\ndemonstrate through human evaluation that VCoT offers novel and consistent\nsynthetic data augmentation beating chain-of-thought baselines, which can be\nused to enhance downstream performance.\n","authors":["Daniel Rose","Vaishnavi Himakunthala","Andy Ouyang","Ryan He","Alex Mei","Yujie Lu","Michael Saxon","Chinmay Sonar","Diba Mirza","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2305.02317v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12447v1","updated":"2024-01-23T02:25:23Z","published":"2024-01-23T02:25:23Z","title":"NIV-SSD: Neighbor IoU-Voting Single-Stage Object Detector From Point\n Cloud","summary":" Previous single-stage detectors typically suffer the misalignment between\nlocalization accuracy and classification confidence. To solve the misalignment\nproblem, we introduce a novel rectification method named neighbor IoU-voting\n(NIV) strategy. Typically, classification and regression are treated as\nseparate branches, making it challenging to establish a connection between\nthem. Consequently, the classification confidence cannot accurately reflect the\nregression quality. NIV strategy can serve as a bridge between classification\nand regression branches by calculating two types of statistical data from the\nregression output to correct the classification confidence. Furthermore, to\nalleviate the imbalance of detection accuracy for complete objects with dense\npoints (easy objects) and incomplete objects with sparse points (difficult\nobjects), we propose a new data augmentation scheme named object resampling. It\nundersamples easy objects and oversamples difficult objects by randomly\ntransforming part of easy objects into difficult objects. Finally, combining\nthe NIV strategy and object resampling augmentation, we design an efficient\nsingle-stage detector termed NIV-SSD. Extensive experiments on several datasets\nindicate the effectiveness of the NIV strategy and the competitive performance\nof the NIV-SSD detector. The code will be available at\nhttps://github.com/Say2L/NIV-SSD.\n","authors":["Shuai Liu","Di Wang","Quan Wang","Kai Huang"],"pdf_url":"https://arxiv.org/pdf/2401.12447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12439v1","updated":"2024-01-23T02:18:53Z","published":"2024-01-23T02:18:53Z","title":"MAST: Video Polyp Segmentation with a Mixture-Attention Siamese\n Transformer","summary":" Accurate segmentation of polyps from colonoscopy videos is of great\nsignificance to polyp treatment and early prevention of colorectal cancer.\nHowever, it is challenging due to the difficulties associated with modelling\nlong-range spatio-temporal relationships within a colonoscopy video. In this\npaper, we address this challenging task with a novel Mixture-Attention Siamese\nTransformer (MAST), which explicitly models the long-range spatio-temporal\nrelationships with a mixture-attention mechanism for accurate polyp\nsegmentation. Specifically, we first construct a Siamese transformer\narchitecture to jointly encode paired video frames for their feature\nrepresentations. We then design a mixture-attention module to exploit the\nintra-frame and inter-frame correlations, enhancing the features with rich\nspatio-temporal relationships. Finally, the enhanced features are fed to two\nparallel decoders for predicting the segmentation maps. To the best of our\nknowledge, our MAST is the first transformer model dedicated to video polyp\nsegmentation. 
Extensive experiments on the large-scale SUN-SEG benchmark\ndemonstrate the superior performance of MAST in comparison with the\ncutting-edge competitors. Our code is publicly available at\nhttps://github.com/Junqing-Yang/MAST.\n","authors":["Geng Chen","Junqing Yang","Xiaozhou Pu","Ge-Peng Ji","Huan Xiong","Yongsheng Pan","Hengfei Cui","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12438v1","updated":"2024-01-23T02:14:05Z","published":"2024-01-23T02:14:05Z","title":"Secure Federated Learning Approaches to Diagnosing COVID-19","summary":" The recent pandemic has underscored the importance of accurately diagnosing\nCOVID-19 in hospital settings. A major challenge in this regard is\ndifferentiating COVID-19 from other respiratory illnesses based on chest\nX-rays, compounded by the restrictions of HIPAA compliance which limit the\ncomparison of patient X-rays. This paper introduces a HIPAA-compliant model to\naid in the diagnosis of COVID-19, utilizing federated learning. Federated\nlearning is a distributed machine learning approach that allows for algorithm\ntraining across multiple decentralized devices using local data samples,\nwithout the need for data sharing. Our model advances previous efforts in chest\nX-ray diagnostic models. We examined leading models from established\ncompetitions in this domain and developed our own models tailored to be\neffective with specific hospital data. Considering the model's operation in a\nfederated learning context, we explored the potential impact of biased data\nupdates on the model's performance. To enhance hospital understanding of the\nmodel's decision-making process and to verify that the model is not focusing on\nirrelevant features, we employed a visualization technique that highlights key\nfeatures in chest X-rays indicative of a positive COVID-19 diagnosis.\n","authors":["Rittika Adhikari","Christopher Settles"],"pdf_url":"https://arxiv.org/pdf/2401.12438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11687v2","updated":"2024-01-23T02:08:09Z","published":"2024-01-22T04:54:42Z","title":"TIM: An Efficient Temporal Interaction Module for Spiking Transformer","summary":" Spiking Neural Networks (SNNs), as the third generation of neural networks,\nhave gained prominence for their biological plausibility and computational\nefficiency, especially in processing diverse datasets. The integration of\nattention mechanisms, inspired by advancements in neural network architectures,\nhas led to the development of Spiking Transformers. These have shown promise in\nenhancing SNNs' capabilities, particularly in the realms of both static and\nneuromorphic datasets. Despite their progress, a discernible gap exists in\nthese systems, specifically in the Spiking Self Attention (SSA) mechanism's\neffectiveness in leveraging the temporal processing potential of SNNs. To\naddress this, we introduce the Temporal Interaction Module (TIM), a novel,\nconvolution-based enhancement designed to augment the temporal data processing\nabilities within SNN architectures. 
TIM's integration into existing SNN\nframeworks is seamless and efficient, requiring minimal additional parameters\nwhile significantly boosting their temporal information handling capabilities.\nThrough rigorous experimentation, TIM has demonstrated its effectiveness in\nexploiting temporal information, leading to state-of-the-art performance across\nvarious neuromorphic datasets.\n","authors":["Sicheng Shen","Dongcheng Zhao","Guobin Shen","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.11687v2.pdf","comment":"9pages,6figures"},{"id":"http://arxiv.org/abs/2401.12433v1","updated":"2024-01-23T01:52:49Z","published":"2024-01-23T01:52:49Z","title":"A Novel Garment Transfer Method Supervised by Distilled Knowledge of\n Virtual Try-on Model","summary":" When a shopper chooses garments online, garment transfer technology wears the\ngarment from the model image onto the shopper's image, allowing the shopper to\ndecide whether the garment is suitable for them. As garment transfer leverages\nwild and cheap person image as garment condition, it has attracted tremendous\ncommunity attention and holds vast commercial potential. However, since the\nground truth of garment transfer is almost unavailable in reality, previous\nstudies have treated garment transfer as either pose transfer or garment-pose\ndisentanglement, and trained garment transfer in self-supervised learning, yet\ndo not cover garment transfer intentions completely. Therefore, the training\nsupervising the garment transfer is a rock-hard issue. Notably, virtual try-on\ntechnology has exhibited superior performance using self-supervised learning.\nWe supervise the garment transfer training via knowledge distillation from\nvirtual try-on. Specifically, we first train the transfer parsing reasoning\nmodel at multi-phases to provide shape guidance for downstream tasks. The\ntransfer parsing reasoning model learns the response and feature knowledge from\nthe try-on parsing reasoning model and absorbs the hard knowledge from the\nground truth. By leveraging the warping knowledge from virtual try-on, we\nestimate a progressive flow to precisely warp the garment by learning the shape\nand content correspondence. To enhance transfer realism, we propose a\nwell-designed arm regrowth task to infer exposed skin pixel content.\nExperiments demonstrate that our method has state-of-the-art performance in\ntransferring garments between person compared with other virtual try-on and\ngarment transfer methods.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Kerui Hu","Jianrong Tan"],"pdf_url":"https://arxiv.org/pdf/2401.12433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10610v3","updated":"2024-01-23T01:48:20Z","published":"2023-08-21T10:20:46Z","title":"Ultrafast and Ultralight Network-Based Intelligent System for Real-time\n Diagnosis of Ear Diseases in Any Devices","summary":" Traditional ear disease diagnosis heavily depends on experienced specialists\nand specialized equipment, frequently resulting in misdiagnoses, treatment\ndelays, and financial burdens for some patients. Utilizing deep learning models\nfor efficient ear disease diagnosis has proven effective and affordable.\nHowever, existing research overlooked model inference speed and parameter size\nrequired for deployment. To tackle these challenges, we constructed a\nlarge-scale dataset comprising eight ear disease categories and normal ear\ncanal samples from two hospitals. 
Inspired by ShuffleNetV2, we developed\nBest-EarNet, an ultrafast and ultralight network enabling real-time ear disease\ndiagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature\nFusion Module which can capture global and local spatial information\nsimultaneously and guide the network to focus on crucial regions within feature\nmaps at various levels, mitigating low accuracy issues. Moreover, our network\nuses multiple auxiliary classification heads for efficient parameter\noptimization. With 0.77M parameters, Best-EarNet achieves an average frames per\nsecond of 80 on CPU. Employing transfer learning and five-fold cross-validation\nwith 22,581 images from Hospital-1, the model achieves an impressive 95.23%\naccuracy. External testing on 1,652 images from Hospital-2 validates its\nperformance, yielding 92.14% accuracy. Compared to state-of-the-art networks,\nBest-EarNet establishes a new state-of-the-art (SOTA) in practical\napplications. Most importantly, we developed an intelligent diagnosis system\ncalled Ear Keeper, which can be deployed on common electronic devices. By\nmanipulating a compact electronic otoscope, users can perform comprehensive\nscanning and diagnosis of the ear canal using real-time video. This study\nprovides a novel paradigm for ear endoscopy and other medical endoscopic image\nrecognition applications.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Haihua Liang","Fan Zhang","Yanmei Chen","Zefeng Xie","Wenrui Wu","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v3.pdf","comment":"18 pages,8 figures"},{"id":"http://arxiv.org/abs/2209.09930v2","updated":"2024-01-23T01:36:36Z","published":"2022-09-20T18:08:34Z","title":"Deep Superpixel Generation and Clustering for Weakly Supervised\n Segmentation of Brain Tumors in MR Images","summary":" Training machine learning models to segment tumors and other anomalies in\nmedical images is an important step for developing diagnostic tools but\ngenerally requires manually annotated ground truth segmentations, which\nnecessitates significant time and resources. This work proposes the use of a\nsuperpixel generation model and a superpixel clustering model to enable weakly\nsupervised brain tumor segmentations. The proposed method utilizes binary\nimage-level classification labels, which are readily accessible, to\nsignificantly improve the initial region of interest segmentations generated by\nstandard weakly supervised methods without requiring ground truth annotations.\nWe used 2D slices of magnetic resonance brain scans from the Multimodal Brain\nTumor Segmentation Challenge 2020 dataset and labels indicating the presence of\ntumors to train the pipeline. On the test cohort, our method achieved a mean\nDice coefficient of 0.691 and a mean 95% Hausdorff distance of 18.1,\noutperforming existing superpixel-based weakly supervised segmentation methods.\n","authors":["Jay J. Yoo","Khashayar Namdar","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2209.09930v2.pdf","comment":"12 pages, LaTeX; updated methodology, added additional results,\n revised discussion"},{"id":"http://arxiv.org/abs/2401.12425v1","updated":"2024-01-23T01:25:00Z","published":"2024-01-23T01:25:00Z","title":"The Neglected Tails of Vision-Language Models","summary":" Vision-language models (VLMs) excel in zero-shot recognition but exhibit\ndrastically imbalanced performance across visual concepts. 
For example, CLIP,\ndespite an impressive mean zero-shot accuracy on ImageNet (72.7%), yields\n$<$10% on ten concepts (e.g., gyromitra and night snake), presumably, because\nthese concepts are under-represented in VLMs' imbalanced pretraining data. Yet,\nassessing this imbalance is challenging as it is non-trivial to calculate the\nfrequency of specific concepts within VLMs' large-scale pretraining data. Our\nwork makes the first attempt to measure the concept frequency by analyzing\npretraining texts. We use off-the-shelf language models to help count relevant\ntexts that contain synonyms of the given concepts and resolve linguistic\nambiguity. We confirm that popular VLM datasets like LAION indeed exhibit\nlong-tailed concept distributions, which strongly correlate with per-class\naccuracies. Further, contemporary multimodal systems, e.g., visual chatbots and\ntext-to-image generators, also struggle with the rare concepts identified by\nour method. To mitigate VLMs' imbalanced performance in zero-shot recognition,\nwe propose REtrieval-Augmented Learning REAL. First, instead of prompting VLMs\nusing the original class names, REAL uses their most frequent synonyms found in\nVLMs' pretraining texts. This already outperforms human-engineered and\nLLM-generated prompts over nine benchmark datasets, likely because VLMs have\nseen more images associated with the frequently used synonyms. Second, REAL\nuses all the concept synonyms to retrieve a small, class-balanced set of\npretraining data to train a robust classifier. REAL surpasses the recent\nretrieval-augmented solution REACT, using 400x less storage and 10,000x less\ntraining time!\n","authors":["Shubham Parashar","Zhiqiu Lin","Tian Liu","Xiangjue Dong","Yanan Li","Deva Ramanan","James Caverlee","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2401.12425v1.pdf","comment":"Project Page:\n https://shubhamprshr27.github.io/neglected-tails-of-vlms/"},{"id":"http://arxiv.org/abs/2401.12422v1","updated":"2024-01-23T01:11:10Z","published":"2024-01-23T01:11:10Z","title":"InverseMatrixVT3D: An Efficient Projection Matrix-Based Approach for 3D\n Occupancy Prediction","summary":" This paper introduces InverseMatrixVT3D, an efficient method for transforming\nmulti-view image features into 3D feature volumes for 3D semantic occupancy\nprediction. Existing methods for constructing 3D volumes often rely on depth\nestimation, device-specific operators, or transformer queries, which hinders\nthe widespread adoption of 3D occupancy models. In contrast, our approach\nleverages two projection matrices to store the static mapping relationships and\nmatrix multiplications to efficiently generate global Bird's Eye View (BEV)\nfeatures and local 3D feature volumes. Specifically, we achieve this by\nperforming matrix multiplications between multi-view image feature maps and two\nsparse projection matrices. We introduce a sparse matrix handling technique for\nthe projection matrices to optimise GPU memory usage. Moreover, a global-local\nattention fusion module is proposed to integrate the global BEV features with\nthe local 3D feature volumes to obtain the final 3D volume. We also employ a\nmulti-scale supervision mechanism to further enhance performance. Comprehensive\nexperiments on the nuScenes dataset demonstrate the simplicity and\neffectiveness of our method. 
The code will be made available\nat:https://github.com/DanielMing123/InverseMatrixVT3D\n","authors":["Zhenxing Ming","Julie Stephany Berrio","Mao Shan","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2401.12422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12421v1","updated":"2024-01-23T01:10:25Z","published":"2024-01-23T01:10:25Z","title":"AdaEmbed: Semi-supervised Domain Adaptation in the Embedding Space","summary":" Semi-supervised domain adaptation (SSDA) presents a critical hurdle in\ncomputer vision, especially given the frequent scarcity of labeled data in\nreal-world settings. This scarcity often causes foundation models, trained on\nextensive datasets, to underperform when applied to new domains. AdaEmbed, our\nnewly proposed methodology for SSDA, offers a promising solution to these\nchallenges. Leveraging the potential of unlabeled data, AdaEmbed facilitates\nthe transfer of knowledge from a labeled source domain to an unlabeled target\ndomain by learning a shared embedding space. By generating accurate and uniform\npseudo-labels based on the established embedding space, the model overcomes the\nlimitations of conventional SSDA, thus enhancing performance significantly. Our\nmethod's effectiveness is validated through extensive experiments on benchmark\ndatasets such as DomainNet, Office-Home, and VisDA-C, where AdaEmbed\nconsistently outperforms all the baselines, setting a new state of the art for\nSSDA. With its straightforward implementation and high data efficiency,\nAdaEmbed stands out as a robust and pragmatic solution for real-world\nscenarios, where labeled data is scarce. To foster further research and\napplication in this area, we are sharing the codebase of our unified framework\nfor semi-supervised domain adaptation.\n","authors":["Ali Mottaghi","Mohammad Abdullah Jamal","Serena Yeung","Omid Mohareri"],"pdf_url":"https://arxiv.org/pdf/2401.12421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11321v2","updated":"2024-01-23T01:09:46Z","published":"2023-05-18T22:09:32Z","title":"JoIN: Joint GANs Inversion for Intrinsic Image Decomposition","summary":" In this work, we propose to solve ill-posed inverse imaging problems using a\nbank of Generative Adversarial Networks (GAN) as a prior and apply our method\nto the case of Intrinsic Image Decomposition for faces and materials. Our\nmethod builds on the demonstrated success of GANs to capture complex image\ndistributions. At the core of our approach is the idea that the latent space of\na GAN is a well-suited optimization domain to solve inverse problems. Given an\ninput image, we propose to jointly inverse the latent codes of a set of GANs\nand combine their outputs to reproduce the input. Contrary to most GAN\ninversion methods which are limited to inverting only a single GAN, we\ndemonstrate that it is possible to maintain distribution priors while inverting\nseveral GANs jointly. 
We show that our approach is modular, allowing various\nforward imaging models, and that it can successfully decompose both synthetic\nand real images.\n","authors":["Viraj Shah","Svetlana Lazebnik","Julien Philip"],"pdf_url":"https://arxiv.org/pdf/2305.11321v2.pdf","comment":"Project webpage is available at https://virajshah.com/join"},{"id":"http://arxiv.org/abs/2401.12419v1","updated":"2024-01-23T00:42:04Z","published":"2024-01-23T00:42:04Z","title":"Multi-modal News Understanding with Professionally Labelled Videos\n (ReutersViLNews)","summary":" While progress has been made in the domain of video-language understanding,\ncurrent state-of-the-art algorithms are still limited in their ability to\nunderstand videos at high levels of abstraction, such as news-oriented videos.\nAlternatively, humans easily amalgamate information from video and language to\ninfer information beyond what is visually observable in the pixels. An example\nof this is watching a news story, where the context of the event can play as\nbig of a role in understanding the story as the event itself. Towards a\nsolution for designing this ability in algorithms, we present a large-scale\nanalysis on an in-house dataset collected by the Reuters News Agency, called\nReuters Video-Language News (ReutersViLNews) dataset which focuses on\nhigh-level video-language understanding with an emphasis on long-form news. The\nReutersViLNews Dataset consists of long-form news videos collected and labeled\nby news industry professionals over several years and contains prominent news\nreporting from around the world. Each video involves a single story and\ncontains action shots of the actual event, interviews with people associated\nwith the event, footage from nearby areas, and more. ReutersViLNews dataset\ncontains videos from seven subject categories: disaster, finance,\nentertainment, health, politics, sports, and miscellaneous with annotations\nfrom high-level to low-level, title caption, visual video description,\nhigh-level story description, keywords, and location. We first present an\nanalysis of the dataset statistics of ReutersViLNews compared to previous\ndatasets. Then we benchmark state-of-the-art approaches for four different\nvideo-language tasks. The results suggest that news-oriented videos are a\nsubstantial challenge for current video-language understanding algorithms and\nwe conclude by providing future directions in designing approaches to solve the\nReutersViLNews dataset.\n","authors":["Shih-Han Chou","Matthew Kowal","Yasmin Niknam","Diana Moyano","Shayaan Mehdi","Richard Pito","Cheng Zhang","Ian Knopke","Sedef Akinli Kocak","Leonid Sigal","Yalda Mohsenzadeh"],"pdf_url":"https://arxiv.org/pdf/2401.12419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12414v1","updated":"2024-01-23T00:06:19Z","published":"2024-01-23T00:06:19Z","title":"Icy Moon Surface Simulation and Stereo Depth Estimation for Sampling\n Autonomy","summary":" Sampling autonomy for icy moon lander missions requires understanding of\ntopographic and photometric properties of the sampling terrain. Unavailability\nof high resolution visual datasets (either bird-eye view or point-of-view from\na lander) is an obstacle for selection, verification or development of\nperception systems. 
We attempt to alleviate this problem by: 1) proposing\nGraphical Utility for Icy moon Surface Simulations (GUISS) framework, for\nversatile stereo dataset generation that spans the spectrum of bulk photometric\nproperties, and 2) focusing on a stereo-based visual perception system and\nevaluating both traditional and deep learning-based algorithms for depth\nestimation from stereo matching. The surface reflectance properties of icy moon\nterrains (Enceladus and Europa) are inferred from multispectral datasets of\nprevious missions. With procedural terrain generation and physically valid\nillumination sources, our framework can fit a wide range of hypotheses with\nrespect to visual representations of icy moon terrains. This is followed by a\nstudy over the performance of stereo matching algorithms under different visual\nhypotheses. Finally, we emphasize the standing challenges to be addressed for\nsimulating perception data assets for icy moons such as Enceladus and Europa.\nOur code can be found here: https://github.com/nasa-jpl/guiss.\n","authors":["Ramchander Bhaskara","Georgios Georgakis","Jeremy Nash","Marissa Cameron","Joseph Bowkett","Adnan Ansar","Manoranjan Majji","Paul Backes"],"pdf_url":"https://arxiv.org/pdf/2401.12414v1.pdf","comment":"Software: https://github.com/nasa-jpl/guiss. IEEE Aerospace\n Conference 2024"},{"id":"http://arxiv.org/abs/2401.13147v1","updated":"2024-01-23T23:50:04Z","published":"2024-01-23T23:50:04Z","title":"Deep Spatiotemporal Clutter Filtering of Transthoracic Echocardiographic\n Images Using a 3D Convolutional Auto-Encoder","summary":" This study presents a deep convolutional auto-encoder network for filtering\nreverberation artifacts, from transthoracic echocardiographic (TTE) image\nsequences. Given the spatiotemporal nature of these artifacts, the filtering\nnetwork was built using 3D convolutional layers to suppress the clutter\npatterns throughout the cardiac cycle. The network was designed by taking\nadvantage of: i) an attention mechanism to focus primarily on cluttered regions\nand ii) residual learning to preserve fine structures of the image frames. To\ntrain the deep network, a diverse set of artifact patterns was simulated and\nthe simulated patterns were superimposed onto artifact-free ultra-realistic\nsynthetic TTE sequences of six ultrasound vendors to generate input of the\nfiltering network. The artifact-free sequences served as ground-truth.\nPerformance of the filtering network was evaluated using unseen synthetic as\nwell as in-vivo artifactual sequences. Satisfactory results obtained using the\nlatter dataset confirmed the good generalization performance of the proposed\nnetwork which was trained using the synthetic sequences and simulated artifact\npatterns. Suitability of the clutter-filtered sequences for further processing\nwas assessed by computing segmental strain curves from them. The results showed\nthat the large discrepancy between the strain profiles computed from the\ncluttered segments and their corresponding segments in the clutter-free images\nwas significantly reduced after filtering the sequences using the proposed\nnetwork. The trained deep network could process an artifactual TTE sequence in\na fraction of a second and can be used for real-time clutter filtering.\nMoreover, it can improve the precision of the clinical indexes that are\ncomputed from the TTE sequences. 
The source code of the proposed method is\navailable at:\nhttps://github.com/MahdiTabassian/Deep-Clutter-Filtering/tree/main.\n","authors":["Mahdi Tabassian","Somayeh Akbari. S","Sandro Queirós","Jan D'hooge"],"pdf_url":"https://arxiv.org/pdf/2401.13147v1.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.00496v2","updated":"2024-01-23T23:30:57Z","published":"2023-12-31T13:32:18Z","title":"SAR-RARP50: Segmentation of surgical instrumentation and Action\n Recognition on Robot-Assisted Radical Prostatectomy Challenge","summary":" Surgical tool segmentation and action recognition are fundamental building\nblocks in many computer-assisted intervention applications, ranging from\nsurgical skills assessment to decision support systems. Nowadays,\nlearning-based action recognition and segmentation approaches outperform\nclassical methods, relying, however, on large, annotated datasets. Furthermore,\naction recognition and tool segmentation algorithms are often trained and make\npredictions in isolation from each other, without exploiting potential\ncross-task relationships. With the EndoVis 2022 SAR-RARP50 challenge, we\nrelease the first multimodal, publicly available, in-vivo, dataset for surgical\naction recognition and semantic instrumentation segmentation, containing 50\nsuturing video segments of Robotic Assisted Radical Prostatectomy (RARP). The\naim of the challenge is twofold. First, to enable researchers to leverage the\nscale of the provided dataset and develop robust and highly accurate\nsingle-task action recognition and tool segmentation approaches in the surgical\ndomain. Second, to further explore the potential of multitask-based learning\napproaches and determine their comparative advantage against their single-task\ncounterparts. A total of 12 teams participated in the challenge, contributing 7\naction recognition methods, 9 instrument segmentation techniques, and 4\nmultitask approaches that integrated both action recognition and instrument\nsegmentation. The complete SAR-RARP50 dataset is available at:\nhttps://rdr.ucl.ac.uk/projects/SARRARP50_Segmentation_of_surgical_instrumentation_and_Action_Recognition_on_Robot-Assisted_Radical_Prostatectomy_Challenge/191091\n","authors":["Dimitrios Psychogyios","Emanuele Colleoni","Beatrice Van Amsterdam","Chih-Yang Li","Shu-Yu Huang","Yuchong Li","Fucang Jia","Baosheng Zou","Guotai Wang","Yang Liu","Maxence Boels","Jiayu Huo","Rachel Sparks","Prokar Dasgupta","Alejandro Granados","Sebastien Ourselin","Mengya Xu","An Wang","Yanan Wu","Long Bai","Hongliang Ren","Atsushi Yamada","Yuriko Harai","Yuto Ishikawa","Kazuyuki Hayashi","Jente Simoens","Pieter DeBacker","Francesco Cisternino","Gabriele Furnari","Alex Mottrie","Federica Ferraguti","Satoshi Kondo","Satoshi Kasai","Kousuke Hirasawa","Soohee Kim","Seung Hyun Lee","Kyu Eun Lee","Hyoun-Joong Kong","Kui Fu","Chao Li","Shan An","Stefanie Krell","Sebastian Bodenstedt","Nicolas Ayobi","Alejandra Perez","Santiago Rodriguez","Juanita Puentes","Pablo Arbelaez","Omid Mohareri","Danail Stoyanov"],"pdf_url":"https://arxiv.org/pdf/2401.00496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13140v1","updated":"2024-01-23T23:28:15Z","published":"2024-01-23T23:28:15Z","title":"Dual-Domain Coarse-to-Fine Progressive Estimation Network for\n Simultaneous Denoising, Limited-View Reconstruction, and Attenuation\n Correction of Cardiac SPECT","summary":" Single-Photon Emission Computed Tomography (SPECT) is widely applied for the\ndiagnosis of coronary artery diseases. 
Low-dose (LD) SPECT aims to minimize\nradiation exposure but leads to increased image noise. Limited-view (LV) SPECT,\nsuch as the latest GE MyoSPECT ES system, enables accelerated scanning and\nreduces hardware expenses but degrades reconstruction accuracy. Additionally,\nComputed Tomography (CT) is commonly used to derive attenuation maps\n($\\mu$-maps) for attenuation correction (AC) of cardiac SPECT, but it will\nintroduce additional radiation exposure and SPECT-CT misalignments. Although\nvarious methods have been developed to solely focus on LD denoising, LV\nreconstruction, or CT-free AC in SPECT, the solution for simultaneously\naddressing these tasks remains challenging and under-explored. Furthermore, it\nis essential to explore the potential of fusing cross-domain and cross-modality\ninformation across these interrelated tasks to further enhance the accuracy of\neach task. Thus, we propose a Dual-Domain Coarse-to-Fine Progressive Network\n(DuDoCFNet), a multi-task learning method for simultaneous LD denoising, LV\nreconstruction, and CT-free $\\mu$-map generation of cardiac SPECT. Paired\ndual-domain networks in DuDoCFNet are cascaded using a multi-layer fusion\nmechanism for cross-domain and cross-modality feature fusion. Two-stage\nprogressive learning strategies are applied in both projection and image\ndomains to achieve coarse-to-fine estimations of SPECT projections and\nCT-derived $\\mu$-maps. Our experiments demonstrate DuDoCFNet's superior\naccuracy in estimating projections, generating $\\mu$-maps, and AC\nreconstructions compared to existing single- or multi-task learning methods,\nunder various iterations and LD levels. The source code of this work is\navailable at https://github.com/XiongchaoChen/DuDoCFNet-MultiTask.\n","authors":["Xiongchao Chen","Bo Zhou","Xueqi Guo","Huidong Xie","Qiong Liu","James S. Duncan","Albert J. Sinusas","Chi Liu"],"pdf_url":"https://arxiv.org/pdf/2401.13140v1.pdf","comment":"11 Pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2211.04625v2","updated":"2024-01-23T21:24:53Z","published":"2022-11-09T01:04:06Z","title":"Soft Augmentation for Image Classification","summary":" Modern neural networks are over-parameterized and thus rely on strong\nregularization such as data augmentation and weight decay to reduce overfitting\nand improve generalization. The dominant form of data augmentation applies\ninvariant transforms, where the learning target of a sample is invariant to the\ntransform applied to that sample. We draw inspiration from human visual\nclassification studies and propose generalizing augmentation with invariant\ntransforms to soft augmentation where the learning target softens non-linearly\nas a function of the degree of the transform applied to the sample: e.g., more\naggressive image crop augmentations produce less confident learning targets. We\ndemonstrate that soft targets allow for more aggressive data augmentation,\noffer more robust performance boosts, work with other augmentation policies,\nand interestingly, produce better calibrated models (since they are trained to\nbe less confident on aggressively cropped/occluded examples). Combined with\nexisting aggressive augmentation strategies, soft target 1) doubles the top-1\naccuracy boost across Cifar-10, Cifar-100, ImageNet-1K, and ImageNet-V2, 2)\nimproves model occlusion performance by up to $4\\times$, and 3) halves the\nexpected calibration error (ECE). Finally, we show that soft augmentation\ngeneralizes to self-supervised classification tasks. 
Code available at\nhttps://github.com/youngleox/soft_augmentation\n","authors":["Yang Liu","Shen Yan","Laura Leal-Taixé","James Hays","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2211.04625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13097v1","updated":"2024-01-23T21:22:06Z","published":"2024-01-23T21:22:06Z","title":"Digital Divides in Scene Recognition: Uncovering Socioeconomic Biases in\n Deep Learning Systems","summary":" Computer-based scene understanding has influenced fields ranging from urban\nplanning to autonomous vehicle performance, yet little is known about how well\nthese technologies work across social differences. We investigate the biases of\ndeep convolutional neural networks (dCNNs) in scene classification, using\nnearly one million images from global and US sources, including user-submitted\nhome photographs and Airbnb listings. We applied statistical models to quantify\nthe impact of socioeconomic indicators such as family income, Human Development\nIndex (HDI), and demographic factors from public data sources (CIA and US\nCensus) on dCNN performance. Our analyses revealed significant socioeconomic\nbias, where pretrained dCNNs demonstrated lower classification accuracy, lower\nclassification confidence, and a higher tendency to assign labels that could be\noffensive when applied to homes (e.g., \"ruin\", \"slum\"), especially in images\nfrom homes with lower socioeconomic status (SES). This trend is consistent\nacross two datasets of international images and within the diverse economic and\nracial landscapes of the United States. This research contributes to\nunderstanding biases in computer vision, emphasizing the need for more\ninclusive and representative training datasets. By mitigating the bias in the\ncomputer vision pipelines, we can ensure fairer and more equitable outcomes for\napplied computer vision, including home valuation and smart home security\nsystems. There is urgency in addressing these biases, which can significantly\nimpact critical decisions in urban development and resource allocation. Our\nfindings also motivate the development of AI systems that better understand and\nserve diverse communities, moving towards technology that equitably benefits\nall sectors of society.\n","authors":["Michelle R. Greene","Mariam Josyula","Wentao Si","Jennifer A. Hart"],"pdf_url":"https://arxiv.org/pdf/2401.13097v1.pdf","comment":"20 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.13087v1","updated":"2024-01-23T20:56:16Z","published":"2024-01-23T20:56:16Z","title":"Open-source data pipeline for street-view images: a case study on\n community mobility during COVID-19 pandemic","summary":" Street View Images (SVI) are a common source of valuable data for\nresearchers. Researchers have used SVI data for estimating pedestrian volumes,\ndemographic surveillance, and to better understand built and natural\nenvironments in cityscapes. However, the most common source of publicly\navailable SVI data is Google Street View. Google Street View images are\ncollected infrequently, making temporal analysis challenging, especially in low\npopulation density areas. Our main contribution is the development of an\nopen-source data pipeline for processing 360-degree video recorded from a\ncar-mounted camera. The video data is used to generate SVIs, which then can be\nused as an input for temporal analysis. 
We demonstrate the use of the pipeline\nby collecting a SVI dataset over a 38-month longitudinal survey of Seattle, WA,\nUSA during the COVID-19 pandemic. The output of our pipeline is validated\nthrough statistical analyses of pedestrian traffic in the images. We confirm\nknown results in the literature and provide new insights into outdoor\npedestrian traffic patterns. This study demonstrates the feasibility and value\nof collecting and using SVI for research purposes beyond what is possible with\ncurrently available SVI data. Limitations and future improvements on the data\npipeline and case study are also discussed.\n","authors":["Matthew Martell","Nick Terry","Ribhu Sengupta","Chris Salazar","Nicole A. Errett","Scott B. Miles","Joseph Wartman","Youngjun Choe"],"pdf_url":"https://arxiv.org/pdf/2401.13087v1.pdf","comment":"16 pages, 4 figures, two tables. Martell and Terry are equally\n contributing first authors"},{"id":"http://arxiv.org/abs/2306.08877v3","updated":"2024-01-23T20:55:48Z","published":"2023-06-15T06:21:44Z","title":"Linguistic Binding in Diffusion Models: Enhancing Attribute\n Correspondence through Attention Map Alignment","summary":" Text-conditioned image generation models often generate incorrect\nassociations between entities and their visual attributes. This reflects an\nimpaired mapping between linguistic binding of entities and modifiers in the\nprompt and visual binding of the corresponding elements in the generated image.\nAs one notable example, a query like \"a pink sunflower and a yellow flamingo\"\nmay incorrectly produce an image of a yellow sunflower and a pink flamingo. To\nremedy this issue, we propose SynGen, an approach which first syntactically\nanalyses the prompt to identify entities and their modifiers, and then uses a\nnovel loss function that encourages the cross-attention maps to agree with the\nlinguistic binding reflected by the syntax. Specifically, we encourage large\noverlap between attention maps of entities and their modifiers, and small\noverlap with other entities and modifier words. The loss is optimized during\ninference, without retraining or fine-tuning the model. Human evaluation on\nthree datasets, including one new and challenging set, demonstrate significant\nimprovements of SynGen compared with current state of the art methods. This\nwork highlights how making use of sentence structure during inference can\nefficiently and substantially improve the faithfulness of text-to-image\ngeneration.\n","authors":["Royi Rassin","Eran Hirsch","Daniel Glickman","Shauli Ravfogel","Yoav Goldberg","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2306.08877v3.pdf","comment":"Accepted to NeurIPS 2023 (oral). Our code is publicly available at\n https://github.com/RoyiRa/Syntax-Guided-Generation"},{"id":"http://arxiv.org/abs/2309.07254v4","updated":"2024-01-23T20:43:50Z","published":"2023-09-13T18:43:13Z","title":"Mitigate Replication and Copying in Diffusion Models with Generalized\n Caption and Dual Fusion Enhancement","summary":" While diffusion models demonstrate a remarkable capability for generating\nhigh-quality images, their tendency to `replicate' training data raises privacy\nconcerns. Although recent research suggests that this replication may stem from\nthe insufficient generalization of training data captions and duplication of\ntraining images, effective mitigation strategies remain elusive. 
To address\nthis gap, our paper first introduces a generality score that measures the\ncaption generality and employ large language model (LLM) to generalize training\ncaptions. Subsequently, we leverage generalized captions and propose a novel\ndual fusion enhancement approach to mitigate the replication of diffusion\nmodels. Our empirical results demonstrate that our proposed methods can\nsignificantly reduce replication by 43.5% compared to the original diffusion\nmodel while maintaining the diversity and quality of generations. Code is\navailable at https://github.com/HowardLi0816/dual-fusion-diffusion.\n","authors":["Chenghao Li","Dake Chen","Yuke Zhang","Peter A. Beerel"],"pdf_url":"https://arxiv.org/pdf/2309.07254v4.pdf","comment":"This paper has been accepted for presentation at 2024 IEEE\n International Conference on Acoustics, Speech, and Signal Processing (ICASSP\n 2024)"},{"id":"http://arxiv.org/abs/2309.04447v3","updated":"2024-01-23T20:34:05Z","published":"2023-09-08T17:13:22Z","title":"Impact of Blur and Resolution on Demographic Disparities in 1-to-Many\n Facial Identification","summary":" Most studies to date that have examined demographic variations in face\nrecognition accuracy have analyzed 1-to-1 matching accuracy, using images that\ncould be described as \"government ID quality\". This paper analyzes the accuracy\nof 1-to-many facial identification across demographic groups, and in the\npresence of blur and reduced resolution in the probe image as might occur in\n\"surveillance camera quality\" images. Cumulative match characteristic curves\n(CMC) are not appropriate for comparing propensity for rank-one recognition\nerrors across demographics, and so we use three metrics for our analysis: (1)\nthe well-known d' metric between mated and non-mated score distributions, and\nintroduced in this work, (2) absolute score difference between thresholds in\nthe high-similarity tail of the non-mated and the low-similarity tail of the\nmated distribution, and (3) distribution of (mated - non-mated rank-one scores)\nacross the set of probe images. We find that demographic variation in 1-to-many\naccuracy does not entirely follow what has been observed in 1-to-1 matching\naccuracy. Also, different from 1-to-1 accuracy, demographic comparison of\n1-to-many accuracy can be affected by different numbers of identities and\nimages across demographics. More importantly, we show that increased blur in\nthe probe image, or reduced resolution of the face in the probe image, can\nsignificantly increase the false positive identification rate. And we show that\nthe demographic variation in these high blur or low resolution conditions is\nmuch larger for male / female than for African-American / Caucasian. The point\nthat 1-to-many accuracy can potentially collapse in the context of processing\n\"surveillance camera quality\" probe images against a \"government ID quality\"\ngallery is an important one.\n","authors":["Aman Bhatta","Gabriella Pangelinan","Michael C. King","Kevin W. 
Bowyer"],"pdf_url":"https://arxiv.org/pdf/2309.04447v3.pdf","comment":"9 pages, 8 figures, Conference submission"},{"id":"http://arxiv.org/abs/2401.13082v1","updated":"2024-01-23T20:28:06Z","published":"2024-01-23T20:28:06Z","title":"PlaceFormer: Transformer-based Visual Place Recognition using\n Multi-Scale Patch Selection and Fusion","summary":" Visual place recognition is a challenging task in the field of computer\nvision, and autonomous robotics and vehicles, which aims to identify a location\nor a place from visual inputs. Contemporary methods in visual place recognition\nemploy convolutional neural networks and utilize every region within the image\nfor the place recognition task. However, the presence of dynamic and\ndistracting elements in the image may impact the effectiveness of the place\nrecognition process. Therefore, it is meaningful to focus on task-relevant\nregions of the image for improved recognition. In this paper, we present\nPlaceFormer, a novel transformer-based approach for visual place recognition.\nPlaceFormer employs patch tokens from the transformer to create global image\ndescriptors, which are then used for image retrieval. To re-rank the retrieved\nimages, PlaceFormer merges the patch tokens from the transformer to form\nmulti-scale patches. Utilizing the transformer's self-attention mechanism, it\nselects patches that correspond to task-relevant areas in an image. These\nselected patches undergo geometric verification, generating similarity scores\nacross different patch sizes. Subsequently, spatial scores from each patch size\nare fused to produce a final similarity score. This score is then used to\nre-rank the images initially retrieved using global image descriptors.\nExtensive experiments on benchmark datasets demonstrate that PlaceFormer\noutperforms several state-of-the-art methods in terms of accuracy and\ncomputational efficiency, requiring less time and memory.\n","authors":["Shyam Sundar Kannan","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2401.13082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13081v1","updated":"2024-01-23T20:26:52Z","published":"2024-01-23T20:26:52Z","title":"Free Form Medical Visual Question Answering in Radiology","summary":" Visual Question Answering (VQA) in the medical domain presents a unique,\ninterdisciplinary challenge, combining fields such as Computer Vision, Natural\nLanguage Processing, and Knowledge Representation. Despite its importance,\nresearch in medical VQA has been scant, only gaining momentum since 2018.\nAddressing this gap, our research delves into the effective representation of\nradiology images and the joint learning of multimodal representations,\nsurpassing existing methods. We innovatively augment the SLAKE dataset,\nenabling our model to respond to a more diverse array of questions, not limited\nto the immediate content of radiology or pathology images. Our model achieves a\ntop-1 accuracy of 79.55\\% with a less complex architecture, demonstrating\ncomparable performance to current state-of-the-art models. 
This research not\nonly advances medical VQA but also opens avenues for practical applications in\ndiagnostic settings.\n","authors":["Abhishek Narayanan","Rushabh Musthyala","Rahul Sankar","Anirudh Prasad Nistala","Pranav Singh","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2401.13081v1.pdf","comment":"6 pages and 4 figures"},{"id":"http://arxiv.org/abs/2401.13076v1","updated":"2024-01-23T20:02:02Z","published":"2024-01-23T20:02:02Z","title":"SemanticSLAM: Learning based Semantic Map Construction and Robust Camera\n Localization","summary":" Current techniques in Visual Simultaneous Localization and Mapping (VSLAM)\nestimate camera displacement by comparing image features of consecutive scenes.\nThese algorithms depend on scene continuity, hence requires frequent camera\ninputs. However, processing images frequently can lead to significant memory\nusage and computation overhead. In this study, we introduce SemanticSLAM, an\nend-to-end visual-inertial odometry system that utilizes semantic features\nextracted from an RGB-D sensor. This approach enables the creation of a\nsemantic map of the environment and ensures reliable camera localization.\nSemanticSLAM is scene-agnostic, which means it doesn't require retraining for\ndifferent environments. It operates effectively in indoor settings, even with\ninfrequent camera input, without prior knowledge. The strength of SemanticSLAM\nlies in its ability to gradually refine the semantic map and improve pose\nestimation. This is achieved by a convolutional long-short-term-memory\n(ConvLSTM) network, trained to correct errors during map construction. Compared\nto existing VSLAM algorithms, SemanticSLAM improves pose estimation by 17%. The\nresulting semantic map provides interpretable information about the environment\nand can be easily applied to various downstream tasks, such as path planning,\nobstacle avoidance, and robot navigation. The code will be publicly available\nat https://github.com/Leomingyangli/SemanticSLAM\n","authors":["Mingyang Li","Yue Ma","Qinru Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.13076v1.pdf","comment":"2023 IEEE Symposium Series on Computational Intelligence (SSCI) 6\n pages"},{"id":"http://arxiv.org/abs/2401.13068v1","updated":"2024-01-23T19:48:34Z","published":"2024-01-23T19:48:34Z","title":"Local Background Estimation for Improved Gas Plume Identification in\n Hyperspectral Images","summary":" Deep learning identification models have shown promise for identifying gas\nplumes in Longwave IR hyperspectral images of urban scenes, particularly when a\nlarge library of gases are being considered. Because many gases have similar\nspectral signatures, it is important to properly estimate the signal from a\ndetected plume. Typically, a scene's global mean spectrum and covariance matrix\nare estimated to whiten the plume's signal, which removes the background's\nsignature from the gas signature. However, urban scenes can have many different\nbackground materials that are spatially and spectrally heterogeneous. This can\nlead to poor identification performance when the global background estimate is\nnot representative of a given local background material. We use image\nsegmentation, along with an iterative background estimation algorithm, to\ncreate local estimates for the various background materials that reside\nunderneath a gas plume. Our method outperforms global background estimation on\na set of simulated and real gas plumes. 
This method shows promise in increasing\ndeep learning identification confidence, while being simple and easy to tune\nwhen considering diverse plumes.\n","authors":["Scout Jarman","Zigfried Hampel-Arias","Adra Carr","Kevin R. Moon"],"pdf_url":"https://arxiv.org/pdf/2401.13068v1.pdf","comment":"Submitted to International Geoscience and Remote Sensing Symposium\n (IGARSS), 2024. 5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.13051v1","updated":"2024-01-23T19:20:22Z","published":"2024-01-23T19:20:22Z","title":"PA-SAM: Prompt Adapter SAM for High-Quality Image Segmentation","summary":" The Segment Anything Model (SAM) has exhibited outstanding performance in\nvarious image segmentation tasks. Despite being trained with over a billion\nmasks, SAM faces challenges in mask prediction quality in numerous scenarios,\nespecially in real-world contexts. In this paper, we introduce a novel\nprompt-driven adapter into SAM, namely Prompt Adapter Segment Anything Model\n(PA-SAM), aiming to enhance the segmentation mask quality of the original SAM.\nBy exclusively training the prompt adapter, PA-SAM extracts detailed\ninformation from images and optimizes the mask decoder feature at both sparse\nand dense prompt levels, improving the segmentation performance of SAM to\nproduce high-quality masks. Experimental results demonstrate that our PA-SAM\noutperforms other SAM-based methods in high-quality, zero-shot, and open-set\nsegmentation. We're making the source code and models available at\nhttps://github.com/xzz2/pa-sam.\n","authors":["Zhaozhi Xie","Bochen Guan","Weihao Jiang","Muyang Yi","Yue Ding","Hongtao Lu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.13051v1.pdf","comment":"Code is available at https://github.com/xzz2/pa-sam"},{"id":"http://arxiv.org/abs/2401.13049v1","updated":"2024-01-23T19:17:20Z","published":"2024-01-23T19:17:20Z","title":"CIS-UNet: Multi-Class Segmentation of the Aorta in Computed Tomography\n Angiography via Context-Aware Shifted Window Self-Attention","summary":" Advancements in medical imaging and endovascular grafting have facilitated\nminimally invasive treatments for aortic diseases. Accurate 3D segmentation of\nthe aorta and its branches is crucial for interventions, as inaccurate\nsegmentation can lead to erroneous surgical planning and endograft\nconstruction. Previous methods simplified aortic segmentation as a binary image\nsegmentation problem, overlooking the necessity of distinguishing between\nindividual aortic branches. In this paper, we introduce Context Infused\nSwin-UNet (CIS-UNet), a deep learning model designed for multi-class\nsegmentation of the aorta and thirteen aortic branches. Combining the strengths\nof Convolutional Neural Networks (CNNs) and Swin transformers, CIS-UNet adopts\na hierarchical encoder-decoder structure comprising a CNN encoder, symmetric\ndecoder, skip connections, and a novel Context-aware Shifted Window\nSelf-Attention (CSW-SA) as the bottleneck block. Notably, CSW-SA introduces a\nunique utilization of the patch merging layer, distinct from conventional Swin\ntransformers. It efficiently condenses the feature map, providing a global\nspatial context and enhancing performance when applied at the bottleneck layer,\noffering superior computational efficiency and segmentation accuracy compared\nto the Swin transformers. We trained our model on computed tomography (CT)\nscans from 44 patients and tested it on 15 patients. 
CIS-UNet outperformed the\nstate-of-the-art SwinUNetR segmentation model, which is solely based on Swin\ntransformers, by achieving a superior mean Dice coefficient of 0.713 compared\nto 0.697, and a mean surface distance of 2.78 mm compared to 3.39 mm.\nCIS-UNet's superior 3D aortic segmentation offers improved precision and\noptimization for planning endovascular treatments. Our dataset and code will be\npublicly available.\n","authors":["Muhammad Imran","Jonathan R Krebs","Veera Rajasekhar Reddy Gopu","Brian Fazzone","Vishal Balaji Sivaraman","Amarjeet Kumar","Chelsea Viscardi","Robert Evans Heithaus","Benjamin Shickel","Yuyin Zhou","Michol A Cooper","Wei Shao"],"pdf_url":"https://arxiv.org/pdf/2401.13049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1811.08075v2","updated":"2024-01-23T19:16:31Z","published":"2018-11-20T04:55:07Z","title":"Scene Graph Generation via Conditional Random Fields","summary":" Despite the great success object detection and segmentation models have\nachieved in recognizing individual objects in images, performance on cognitive\ntasks such as image caption, semantic image retrieval, and visual QA is far\nfrom satisfactory. To achieve better performance on these cognitive tasks,\nmerely recognizing individual object instances is insufficient. Instead, the\ninteractions between object instances need to be captured in order to\nfacilitate reasoning and understanding of the visual scenes in an image. Scene\ngraph, a graph representation of images that captures object instances and\ntheir relationships, offers a comprehensive understanding of an image. However,\nexisting techniques on scene graph generation fail to distinguish subjects and\nobjects in the visual scenes of images and thus do not perform well with\nreal-world datasets where exist ambiguous object instances. In this work, we\npropose a novel scene graph generation model for predicting object instances\nand its corresponding relationships in an image. Our model, SG-CRF, learns the\nsequential order of subject and object in a relationship triplet, and the\nsemantic compatibility of object instance nodes and relationship nodes in a\nscene graph efficiently. Experiments empirically show that SG-CRF outperforms\nthe state-of-the-art methods, on three different datasets, i.e., CLEVR, VRD,\nand Visual Genome, raising the Recall@100 from 24.99% to 49.95%, from 41.92% to\n50.47%, and from 54.69% to 54.77%, respectively.\n","authors":["Weilin Cong","William Wang","Wang-Chien Lee"],"pdf_url":"https://arxiv.org/pdf/1811.08075v2.pdf","comment":"Need to withdraw this draft as requested by collaborators"},{"id":"http://arxiv.org/abs/2401.13011v1","updated":"2024-01-23T11:46:28Z","published":"2024-01-23T11:46:28Z","title":"CCA: Collaborative Competitive Agents for Image Editing","summary":" This paper presents a novel generative model, Collaborative Competitive\nAgents (CCA), which leverages the capabilities of multiple Large Language\nModels (LLMs) based agents to execute complex tasks. Drawing inspiration from\nGenerative Adversarial Networks (GANs), the CCA system employs two equal-status\ngenerator agents and a discriminator agent. The generators independently\nprocess user instructions and generate results, while the discriminator\nevaluates the outputs, and provides feedback for the generator agents to\nfurther reflect and improve the generation results. 
Unlike the previous\ngenerative model, our system can obtain the intermediate steps of generation.\nThis allows each generator agent to learn from other successful executions due\nto its transparency, enabling a collaborative competition that enhances the\nquality and robustness of the system's results. The primary focus of this study\nis image editing, demonstrating the CCA's ability to handle intricate\ninstructions robustly. The paper's main contributions include the introduction\nof a multi-agent-based generative model with controllable intermediate steps\nand iterative optimization, a detailed examination of agent relationships, and\ncomprehensive experiments on image editing. Code is available at\n\\href{https://github.com/TiankaiHang/CCA}{https://github.com/TiankaiHang/CCA}.\n","authors":["Tiankai Hang","Shuyang Gu","Dong Chen","Xin Geng","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2401.13011v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.12798v1","updated":"2024-01-23T14:31:12Z","published":"2024-01-23T14:31:12Z","title":"Gradient Flow of Energy: A General and Efficient Approach for Entity\n Alignment Decoding","summary":" Entity alignment (EA), a pivotal process in integrating multi-source\nKnowledge Graphs (KGs), seeks to identify equivalent entity pairs across these\ngraphs. Most existing approaches regard EA as a graph representation learning\ntask, concentrating on enhancing graph encoders. However, the decoding process\nin EA - essential for effective operation and alignment accuracy - has received\nlimited attention and remains tailored to specific datasets and model\narchitectures, necessitating both entity and additional explicit relation\nembeddings. This specificity limits its applicability, particularly in\nGNN-based models. To address this gap, we introduce a novel, generalized, and\nefficient decoding approach for EA, relying solely on entity embeddings. Our\nmethod optimizes the decoding process by minimizing Dirichlet energy, leading\nto the gradient flow within the graph, to promote graph homophily. The\ndiscretization of the gradient flow produces a fast and scalable approach,\ntermed Triple Feature Propagation (TFP). TFP innovatively channels gradient\nflow through three views: entity-to-entity, entity-to-relation, and\nrelation-to-entity. This generalized gradient flow enables TFP to harness the\nmulti-view structural information of KGs. Rigorous experimentation on diverse\nreal-world datasets demonstrates that our approach significantly enhances\nvarious EA methods. Notably, the approach achieves these advancements with less\nthan 6 seconds of additional computational time, establishing a new benchmark\nin efficiency and adaptability for future EA methods.\n","authors":["Yuanyi Wang","Haifeng Sun","Jingyu Wang","Qi Qi","Shaoling Sun","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2401.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16716v4","updated":"2024-01-23T14:05:58Z","published":"2023-11-28T12:00:06Z","title":"GraphPro: Graph Pre-training and Prompt Learning for Recommendation","summary":" GNN-based recommenders have excelled in modeling intricate user-item\ninteractions through multi-hop message passing. However, existing methods often\noverlook the dynamic nature of evolving user-item interactions, which impedes\nthe adaption to changing user preferences and distribution shifts in newly\narriving data. Thus, their scalability and performances in real-world dynamic\nenvironments are limited. 
In this study, we propose GraphPro, a framework that\nincorporates parameter-efficient and dynamic graph pre-training with prompt\nlearning. This novel combination empowers GNNs to effectively capture both\nlong-term user preferences and short-term behavior dynamics, enabling the\ndelivery of accurate and timely recommendations. Our GraphPro framework\naddresses the challenge of evolving user preferences by seamlessly integrating\na temporal prompt mechanism and a graph-structural prompt learning mechanism\ninto the pre-trained GNN model. The temporal prompt mechanism encodes time\ninformation on user-item interaction, allowing the model to naturally capture\ntemporal context, while the graph-structural prompt learning mechanism enables\nthe transfer of pre-trained knowledge to adapt to behavior dynamics without the\nneed for continuous incremental training. We further bring in a dynamic\nevaluation setting for recommendation to mimic real-world dynamic scenarios and\nbridge the offline-online gap to a better level. Our extensive experiments\nincluding a large-scale industrial deployment showcases the lightweight plug-in\nscalability of our GraphPro when integrated with various state-of-the-art\nrecommenders, emphasizing the advantages of GraphPro in terms of effectiveness,\nrobustness and efficiency.\n","authors":["Yuhao Yang","Lianghao Xia","Da Luo","Kangyi Lin","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.16716v4.pdf","comment":"Accepted by WWW'2024, full paper"},{"id":"http://arxiv.org/abs/2310.14037v2","updated":"2024-01-23T13:40:30Z","published":"2023-10-21T15:21:39Z","title":"Unlock Multi-Modal Capability of Dense Retrieval via Visual Module\n Plugin","summary":" This paper proposes Multi-modAl Retrieval model via Visual modulE pLugin\n(MARVEL) to learn an embedding space for queries and multi-modal documents to\nconduct retrieval. MARVEL encodes queries and multi-modal documents with a\nunified encoder model, which helps to alleviate the modality gap between images\nand texts. Specifically, we enable the image understanding ability of a\nwell-trained dense retriever, T5-ANCE, by incorporating the image features\nencoded by the visual module as its inputs. To facilitate the multi-modal\nretrieval tasks, we build the ClueWeb22-MM dataset based on the ClueWeb22\ndataset, which regards anchor texts as queries, and exact the related texts and\nimage documents from anchor linked web pages. Our experiments show that MARVEL\nsignificantly outperforms the state-of-the-art methods on the multi-modal\nretrieval dataset WebQA and ClueWeb22-MM. Our further analyses show that the\nvisual module plugin method is tailored to enable the image understanding\nability for an existing dense retrieval model. Besides, we also show that the\nlanguage model has the ability to extract image semantics from image encoders\nand adapt the image features in the input space of language models. 
All codes\nare available at https://github.com/OpenMatch/MARVEL.\n","authors":["Tianshuo Zhou","Sen Mei","Xinze Li","Zhenghao Liu","Chenyan Xiong","Zhiyuan Liu","Yu Gu","Ge Yu"],"pdf_url":"https://arxiv.org/pdf/2310.14037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12732v1","updated":"2024-01-23T13:06:19Z","published":"2024-01-23T13:06:19Z","title":"CDRNP: Cross-Domain Recommendation to Cold-Start Users via Neural\n Process","summary":" Cross-domain recommendation (CDR) has been proven as a promising way to\ntackle the user cold-start problem, which aims to make recommendations for\nusers in the target domain by transferring the user preference derived from the\nsource domain. Traditional CDR studies follow the embedding and mapping (EMCDR)\nparadigm, which transfers user representations from the source to target domain\nby learning a user-shared mapping function, neglecting the user-specific\npreference. Recent CDR studies attempt to learn user-specific mapping functions\nin meta-learning paradigm, which regards each user's CDR as an individual task,\nbut neglects the preference correlations among users, limiting the beneficial\ninformation for user representations. Moreover, both of the paradigms neglect\nthe explicit user-item interactions from both domains during the mapping\nprocess. To address the above issues, this paper proposes a novel CDR framework\nwith neural process (NP), termed as CDRNP. Particularly, it develops the\nmeta-learning paradigm to leverage user-specific preference, and further\nintroduces a stochastic process by NP to capture the preference correlations\namong the overlapping and cold-start users, thus generating more powerful\nmapping functions by mapping the user-specific preference and common preference\ncorrelations to a predictive probability distribution. In addition, we also\nintroduce a preference remainer to enhance the common preference from the\noverlapping users, and finally devises an adaptive conditional decoder with\npreference modulation to make prediction for cold-start users with items in the\ntarget domain. Experimental results demonstrate that CDRNP outperforms previous\nSOTA methods in three real-world CDR scenarios.\n","authors":["Xiaodong Li","Jiawei Sheng","Jiangxia Cao","Wenyuan Zhang","Quangang Li","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2401.12732v1.pdf","comment":"This paper is accepted by WSDM'2024 Oral"},{"id":"http://arxiv.org/abs/2401.12593v1","updated":"2024-01-23T09:48:08Z","published":"2024-01-23T09:48:08Z","title":"MOReGIn: Multi-Objective Recommendation at the Global and Individual\n Levels","summary":" Multi-Objective Recommender Systems (MORSs) emerged as a paradigm to\nguarantee multiple (often conflicting) goals. Besides accuracy, a MORS can\noperate at the global level, where additional beyond-accuracy goals are met for\nthe system as a whole, or at the individual level, meaning that the\nrecommendations are tailored to the needs of each user. The state-of-the-art\nMORSs either operate at the global or individual level, without assuming the\nco-existence of the two perspectives. In this study, we show that when global\nand individual objectives co-exist, MORSs are not able to meet both types of\ngoals. To overcome this issue, we present an approach that regulates the\nrecommendation lists so as to guarantee both global and individual\nperspectives, while preserving its effectiveness. Specifically, as individual\nperspective, we tackle genre calibration and, as global perspective, provider\nfairness. 
We validate our approach on two real-world datasets, publicly\nreleased with this paper.\n","authors":["Elizabeth Gómez","David Contreras","Ludovico Boratto","Maria Salamó"],"pdf_url":"https://arxiv.org/pdf/2401.12593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12590v1","updated":"2024-01-23T09:45:49Z","published":"2024-01-23T09:45:49Z","title":"PolyCF: Towards the Optimal Spectral Graph Filters for Collaborative\n Filtering","summary":" Collaborative Filtering (CF) is a pivotal research area in recommender\nsystems that capitalizes on collaborative similarities between users and items\nto provide personalized recommendations. With the remarkable achievements of\nnode embedding-based Graph Neural Networks (GNNs), we explore the upper bounds\nof expressiveness inherent to embedding-based methodologies and tackle the\nchallenges by reframing the CF task as a graph signal processing problem. To\nthis end, we propose PolyCF, a flexible graph signal filter that leverages\npolynomial graph filters to process interaction signals. PolyCF exhibits the\ncapability to capture spectral features across multiple eigenspaces through a\nseries of Generalized Gram filters and is able to approximate the optimal\npolynomial response function for recovering missing interactions. A graph\noptimization objective and a pair-wise ranking objective are jointly used to\noptimize the parameters of the convolution kernel. Experiments on three widely\nadopted datasets demonstrate the superiority of PolyCF over current\nstate-of-the-art CF methods. Moreover, comprehensive studies empirically\nvalidate each component's efficacy in the proposed PolyCF.\n","authors":["Yifang Qin","Wei Ju","Xiao Luo","Yiyang Gu","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.12590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12553v1","updated":"2024-01-23T08:24:44Z","published":"2024-01-23T08:24:44Z","title":"InfoRank: Unbiased Learning-to-Rank via Conditional Mutual Information\n Minimization","summary":" Ranking items regarding individual user interests is a core technique of\nmultiple downstream tasks such as recommender systems. Learning such a\npersonalized ranker typically relies on the implicit feedback from users' past\nclick-through behaviors. However, collected feedback is biased toward\npreviously highly-ranked items and directly learning from it would result in a\n\"rich-get-richer\" phenomenon. In this paper, we propose a simple yet sufficient\nunbiased learning-to-rank paradigm named InfoRank that aims to simultaneously\naddress both position and popularity biases. We begin by consolidating the\nimpacts of those biases into a single observation factor, thereby providing a\nunified approach to addressing bias-related issues. Subsequently, we minimize\nthe mutual information between the observation estimation and the relevance\nestimation conditioned on the input features. By doing so, our relevance\nestimation can be proved to be free of bias. To implement InfoRank, we first\nincorporate an attention mechanism to capture latent correlations within\nuser-item features, thereby generating estimations of observation and\nrelevance. We then introduce a regularization term, grounded in conditional\nmutual information, to promote conditional independence between relevance\nestimation and observation estimation. 
Experimental evaluations conducted\nacross three extensive recommendation and search datasets reveal that InfoRank\nlearns more precise and unbiased ranking strategies.\n","authors":["Jiarui Jin","Zexue He","Mengyue Yang","Weinan Zhang","Yong Yu","Jun Wang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2401.12553v1.pdf","comment":"WWW 2024"},{"id":"http://arxiv.org/abs/2212.12970v3","updated":"2024-01-23T07:57:55Z","published":"2022-12-25T23:19:56Z","title":"Refined Edge Usage of Graph Neural Networks for Edge Prediction","summary":" Graph Neural Networks (GNNs), originally proposed for node classification,\nhave also motivated many recent works on edge prediction (a.k.a., link\nprediction). However, existing methods lack elaborate design regarding the\ndistinctions between two tasks that have been frequently overlooked: (i) edges\nonly constitute the topology in the node classification task but can be used as\nboth the topology and the supervisions (i.e., labels) in the edge prediction\ntask; (ii) the node classification makes prediction over each individual node,\nwhile the edge prediction is determinated by each pair of nodes. To this end,\nwe propose a novel edge prediction paradigm named Edge-aware Message PassIng\nneuRal nEtworks (EMPIRE). Concretely, we first introduce an edge splitting\ntechnique to specify use of each edge where each edge is solely used as either\nthe topology or the supervision (named as topology edge or supervision edge).\nWe then develop a new message passing mechanism that generates the messages to\nsource nodes (through topology edges) being aware of target nodes (through\nsupervision edges). In order to emphasize the differences between pairs\nconnected by supervision edges and pairs unconnected, we further weight the\nmessages to highlight the relative ones that can reflect the differences. In\naddition, we design a novel negative node-pair sampling trick that efficiently\nsamples 'hard' negative instances in the supervision instances, and can\nsignificantly improve the performance. Experimental results verify that the\nproposed method can significantly outperform existing state-of-the-art models\nregarding the edge prediction task on multiple homogeneous and heterogeneous\ngraph datasets.\n","authors":["Jiarui Jin","Yangkun Wang","Weinan Zhang","Quan Gan","Xiang Song","Yong Yu","Zheng Zhang","David Wipf"],"pdf_url":"https://arxiv.org/pdf/2212.12970v3.pdf","comment":"Need major revisions"},{"id":"http://arxiv.org/abs/2310.03025v2","updated":"2024-01-23T07:49:13Z","published":"2023-10-04T17:59:41Z","title":"Retrieval meets Long Context Large Language Models","summary":" Extending the context window of large language models (LLMs) is getting\npopular recently, while the solution of augmenting LLMs with retrieval has\nexisted for years. The natural questions are: i) Retrieval-augmentation versus\nlong context window, which one is better for downstream tasks? ii) Can both\nmethods be combined to get the best of both worlds? In this work, we answer\nthese questions by studying both solutions using two state-of-the-art\npretrained LLMs, i.e., a proprietary 43B GPT and Llama2-70B. Perhaps\nsurprisingly, we find that LLM with 4K context window using simple\nretrieval-augmentation at generation can achieve comparable performance to\nfinetuned LLM with 16K context window via positional interpolation on long\ncontext tasks, while taking much less computation. 
More importantly, we\ndemonstrate that retrieval can significantly improve the performance of LLMs\nregardless of their extended context window sizes. Our best model,\nretrieval-augmented Llama2-70B with 32K context window, outperforms\nGPT-3.5-turbo-16k and Davinci003 in terms of average score on nine long context\ntasks including question answering, query-based summarization, and in-context\nfew-shot learning tasks. It also outperforms its non-retrieval Llama2-70B-32k\nbaseline by a margin, while being much faster at generation. Our study provides\ngeneral insights on the choice of retrieval-augmentation versus long context\nextension of LLM for practitioners.\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Lawrence McAfee","Chen Zhu","Zihan Liu","Sandeep Subramanian","Evelina Bakhturina","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.03025v2.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12540v1","updated":"2024-01-23T07:48:58Z","published":"2024-01-23T07:48:58Z","title":"DREditor: An Time-efficient Approach for Building a Domain-specific\n Dense Retrieval Model","summary":" Deploying dense retrieval models efficiently is becoming increasingly\nimportant across various industries. This is especially true for enterprise\nsearch services, where customizing search engines to meet the time demands of\ndifferent enterprises in different domains is crucial. Motivated by this, we\ndevelop a time-efficient approach called DREditor to edit the matching rule of\nan off-the-shelf dense retrieval model to suit a specific domain. This is\nachieved by directly calibrating the output embeddings of the model using an\nefficient and effective linear mapping. This mapping is powered by an edit\noperator that is obtained by solving a specially constructed least squares\nproblem. Compared to implicit rule modification via long-time finetuning, our\nexperimental results show that DREditor provides significant advantages on\ndifferent domain-specific datasets, dataset sources, retrieval models, and\ncomputing devices. It consistently enhances time efficiency by 100-300 times\nwhile maintaining comparable or even superior retrieval performance. In a\nbroader context, we take the first step to introduce a novel embedding\ncalibration approach for the retrieval task, filling the technical blank in the\ncurrent field of embedding calibration. This approach also paves the way for\nbuilding domain-specific dense retrieval models efficiently and inexpensively.\n","authors":["Chen Huang","Duanyu Feng","Wenqiang Lei","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2401.12540v1.pdf","comment":"15 pages, 6 figures, Codes are available at\n https://github.com/huangzichun/DREditor"},{"id":"http://arxiv.org/abs/2401.12520v1","updated":"2024-01-23T06:30:05Z","published":"2024-01-23T06:30:05Z","title":"Key Information Retrieval to Classify the Unstructured Data Content of\n Preferential Trade Agreements","summary":" With the rapid proliferation of textual data, predicting long texts has\nemerged as a significant challenge in the domain of natural language\nprocessing. Traditional text prediction methods encounter substantial\ndifficulties when grappling with long texts, primarily due to the presence of\nredundant and irrelevant information, which impedes the model's capacity to\ncapture pivotal insights from the text. To address this issue, we introduce a\nnovel approach to long-text classification and prediction. 
Initially, we employ\nembedding techniques to condense the long texts, aiming to diminish the\nredundancy therein. Subsequently,the Bidirectional Encoder Representations from\nTransformers (BERT) embedding method is utilized for text classification\ntraining. Experimental outcomes indicate that our method realizes considerable\nperformance enhancements in classifying long texts of Preferential Trade\nAgreements. Furthermore, the condensation of text through embedding methods not\nonly augments prediction accuracy but also substantially reduces computational\ncomplexity. Overall, this paper presents a strategy for long-text prediction,\noffering a valuable reference for researchers and engineers in the natural\nlanguage processing sphere.\n","authors":["Jiahui Zhao","Ziyi Meng","Stepan Gordeev","Zijie Pan","Dongjin Song","Sandro Steinbach","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2401.12520v1.pdf","comment":"AI4TS Workshop@AAAI 2024 accepted publication"},{"id":"http://arxiv.org/abs/2401.10225v2","updated":"2024-01-23T05:04:32Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Building GPT-4 Level Conversational QA Models","summary":" In this work, we introduce ChatQA, a family of conversational question\nanswering (QA) models that obtain GPT-4 level accuracies. Specifically, we\npropose a two-stage instruction tuning method that can significantly improve\nthe zero-shot conversational QA results from large language models (LLMs). To\nhandle retrieval-augmented generation in conversational QA, we fine-tune a\ndense retriever on a multi-turn QA dataset, which provides comparable results\nto using the state-of-the-art query rewriting model while largely reducing\ndeployment cost. Notably, our ChatQA-70B can outperform GPT-4 in terms of\naverage score on 10 conversational QA datasets (54.14 vs. 53.90), without\nrelying on any synthetic data from OpenAI GPT models.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Chankyu Lee","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v2.pdf","comment":"We added ChatQA-22B results"},{"id":"http://arxiv.org/abs/2309.09085v3","updated":"2024-01-23T05:02:45Z","published":"2023-09-16T19:40:30Z","title":"SynthTab: Leveraging Synthesized Data for Guitar Tablature Transcription","summary":" Guitar tablature is a form of music notation widely used among guitarists. It\ncaptures not only the musical content of a piece, but also its implementation\nand ornamentation on the instrument. Guitar Tablature Transcription (GTT) is an\nimportant task with broad applications in music education, composition, and\nentertainment. Existing GTT datasets are quite limited in size and scope,\nrendering models trained on them prone to overfitting and incapable of\ngeneralizing to out-of-domain data. In order to address this issue, we present\na methodology for synthesizing large-scale GTT audio using commercial acoustic\nand electric guitar plugins. We procure SynthTab, a dataset derived from\nDadaGP, which is a vast and diverse collection of richly annotated symbolic\ntablature. The proposed synthesis pipeline produces audio which faithfully\nadheres to the original fingerings and a subset of techniques specified in the\ntablature, and covers multiple guitars and styles for each track. Experiments\nshow that pre-training a baseline GTT model on SynthTab can improve\ntranscription performance when fine-tuning and testing on an individual\ndataset. 
More importantly, cross-dataset experiments show that pre-training\nsignificantly mitigates issues with overfitting.\n","authors":["Yongyi Zang","Yi Zhong","Frank Cwitkowitz","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2309.09085v3.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12483v1","updated":"2024-01-23T04:32:32Z","published":"2024-01-23T04:32:32Z","title":"Persona-centric Metamorphic Relation guided Robustness Evaluation for\n Multi-turn Dialogue Modelling","summary":" Recently there has been significant progress in the field of dialogue system\nthanks to the introduction of training paradigms such as fine-tune and prompt\nlearning. Persona can function as the prior knowledge for maintaining the\npersonality consistency of dialogue systems, which makes it perform well on\naccuracy. Nonetheless, the conventional reference-based evaluation method falls\nshort in capturing the genuine text comprehension prowess of the model,\nsignificantly relying on the quality of data annotation. In contrast, the\napplication of metamorphic testing offers a more profound insight into the\nmodel's distinct capabilities without necessitating supplementary annotation\nlabels. This approach furnishes a more comprehensive portrayal of the model's\nintricacies and exposes intricacies concealed within reference-based validation\ntechniques. Consequently, we introduce a persona-centric metamorphic relation\nconstruction for metamorphic testing, aimed at evaluating both the persona\nconsistency and robustness of personalized dialogue models. For that reason,\nthis work evaluates several widely used training paradigms including learning\nfrom scratch, pretrain + fine-tune and prompt learning in personalized dialogue\nretrieval to know if they are more robust or if they have the same flaws as\ntheir predecessor. Under three kinds of designed metamorphic relations with\nconsistent outputs, our experimental results reveal that prompt learning shows\nstronger robustness compared to training from scratch and fine-tune. Although\ntested retrieval models gain competitively high retrieval accuracy according to\nthe traditional reference-based validation, they are still fragile and\ndemonstrate various unexpected behaviors, thus there is still room for future\nimprovement in personalized dialogue retrieval.\n","authors":["Yanbing Chen","Lin Li","Xiaohui Tao","Dong Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.12483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v2","updated":"2024-01-23T03:35:40Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. 
In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["Man Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12445v1","updated":"2024-01-23T02:24:17Z","published":"2024-01-23T02:24:17Z","title":"Session-level Normalization and Click-through Data Enhancement for\n Session-based Evaluation","summary":" Since a user usually has to issue a sequence of queries and examine multiple\ndocuments to resolve a complex information need in a search session,\nresearchers have paid much attention to evaluating search systems at the\nsession level rather than the single-query level. Most existing session-level\nmetrics evaluate each query separately and then aggregate the query-level\nscores using a session-level weighting function. The assumptions behind these\nmetrics are that all queries in the session should be involved, and their\norders are fixed. However, if a search system could make the user satisfied\nwith her first few queries, she may not need any subsequent queries. Besides,\nin most real-world search scenarios, due to a lack of explicit feedback from\nreal users, we can only leverage some implicit feedback, such as users' clicks,\nas relevance labels for offline evaluation. Such implicit feedback might be\ndifferent from the real relevance in a search session as some documents may be\nomitted in the previous query but identified in the later reformulations. To\naddress the above issues, we make two assumptions about session-based\nevaluation, which explicitly describe an ideal session-search system and how to\nenhance click-through data in computing session-level evaluation metrics. Based\non our assumptions, we design a session-level metric called Normalized\nU-Measure (NUM). NUM evaluates a session as a whole and utilizes an ideal\nsession to normalize the result of the actual session. Besides, it infers\nsession-level relevance labels based on implicit feedback. Experiments on two\npublic datasets demonstrate the effectiveness of NUM by comparing it with\nexisting session-based metrics in terms of correlation with user satisfaction\nand intuitiveness. We also conduct ablation studies to explore whether these\nassumptions hold.\n","authors":["Haonan Chen","Zhicheng Dou","Jiaxin Mao"],"pdf_url":"https://arxiv.org/pdf/2401.12445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11478v2","updated":"2024-01-23T02:22:51Z","published":"2024-01-21T12:51:28Z","title":"D2K: Turning Historical Data into Retrievable Knowledge for Recommender\n Systems","summary":" A vast amount of user behavior data is constantly accumulating on today's\nlarge recommendation platforms, recording users' various interests and tastes.\nPreserving knowledge from the old data while new data continually arrives is a\nvital problem for recommender systems. Existing approaches generally seek to\nsave the knowledge implicitly in the model parameters. However, such a\nparameter-centric approach lacks scalability and flexibility -- the capacity is\nhard to scale, and the knowledge is inflexible to utilize. Hence, in this work,\nwe propose a framework that turns massive user behavior data to retrievable\nknowledge (D2K). 
It is a data-centric approach that is model-agnostic and easy\nto scale up. Different from only storing unary knowledge such as the user-side\nor item-side information, D2K propose to store ternary knowledge for\nrecommendation, which is determined by the complete recommendation factors --\nuser, item, and context. The knowledge retrieved by target samples can be\ndirectly used to enhance the performance of any recommendation algorithms.\nSpecifically, we introduce a Transformer-based knowledge encoder to transform\nthe old data into knowledge with the user-item-context cross features. A\npersonalized knowledge adaptation unit is devised to effectively exploit the\ninformation from the knowledge base by adapting the retrieved knowledge to the\ntarget samples. Extensive experiments on two public datasets show that D2K\nsignificantly outperforms existing baselines and is compatible with a major\ncollection of recommendation algorithms.\n","authors":["Jiarui Qin","Weiwen Liu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11478v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.09477v2","updated":"2024-01-23T00:46:07Z","published":"2023-09-18T04:17:44Z","title":"How Much Freedom Does An Effectiveness Metric Really Have?","summary":" It is tempting to assume that because effectiveness metrics have free choice\nto assign scores to search engine result pages (SERPs) there must thus be a\nsimilar degree of freedom as to the relative order that SERP pairs can be put\ninto. In fact that second freedom is, to a considerable degree, illusory.\nThat's because if one SERP in a pair has been given a certain score by a\nmetric, fundamental ordering constraints in many cases then dictate that the\nscore for the second SERP must be either not less than, or not greater than,\nthe score assigned to the first SERP. We refer to these fixed relationships as\ninnate pairwise SERP orderings. Our first goal in this work is to describe and\ndefend those pairwise SERP relationship constraints, and tabulate their\nrelative occurrence via both exhaustive and empirical experimentation.\n We then consider how to employ such innate pairwise relationships in IR\nexperiments, leading to a proposal for a new measurement paradigm.\nSpecifically, we argue that tables of results in which many different metrics\nare listed for champion versus challenger system comparisons should be avoided;\nand that instead a single metric be argued for in principled terms, with any\nrelationships identified by that metric then reinforced via an assessment of\nthe innate relationship as to whether other metrics - indeed, all other metrics\n- are likely to yield the same system-vs-system outcome.\n","authors":["Alistair Moffat","Joel Mackenzie"],"pdf_url":"https://arxiv.org/pdf/2309.09477v2.pdf","comment":"To Appear: Journal of the Association for Information Science and\n Technology, 2024"},{"id":"http://arxiv.org/abs/2401.10841v2","updated":"2024-01-23T20:05:30Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech in\n extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. 
Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v2.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.04079v2","updated":"2024-01-23T18:59:52Z","published":"2024-01-08T18:31:38Z","title":"RudolfV: A Foundation Model by Pathologists for Pathologists","summary":" Histopathology plays a central role in clinical medicine and biomedical\nresearch. While artificial intelligence shows promising results on many\npathological tasks, generalization and dealing with rare diseases, where\ntraining data is scarce, remains a challenge. Distilling knowledge from\nunlabeled data into a foundation model before learning from, potentially\nlimited, labeled data provides a viable path to address these challenges. In\nthis work, we extend the state of the art of foundation models for digital\npathology whole slide images by semi-automated data curation and incorporating\npathologist domain knowledge. Specifically, we combine computational and\npathologist domain knowledge (1) to curate a diverse dataset of 103k slides\ncorresponding to 750 million image patches covering data from different\nfixation, staining, and scanning protocols as well as data from different\nindications and labs across the EU and US, (2) for grouping semantically\nsimilar slides and tissue patches, and (3) to augment the input images during\ntraining. 
We evaluate the resulting model on a set of public and internal\nbenchmarks and show that although our foundation model is trained with an order\nof magnitude fewer slides, it performs on par with or better than competing models.\nWe expect that scaling our approach to more data and larger models will further\nincrease its performance and capacity to deal with increasingly complex real-world\ntasks in diagnostics and biomedical research.\n","authors":["Jonas Dippel","Barbara Feulner","Tobias Winterhoff","Simon Schallenberg","Gabriel Dernbach","Andreas Kunft","Stephan Tietz","Philipp Jurmeister","David Horst","Lukas Ruff","Klaus-Robert Müller","Frederick Klauschen","Maximilian Alber"],"pdf_url":"https://arxiv.org/pdf/2401.04079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12433v2","updated":"2024-01-23T18:59:39Z","published":"2023-12-19T18:58:40Z","title":"Tracking Any Object Amodally","summary":" Amodal perception, the ability to comprehend complete object structures from\npartial visibility, is a fundamental skill, even for infants. Its significance\nextends to applications like autonomous driving, where a clear understanding of\nheavily occluded objects is essential. However, modern detection and tracking\nalgorithms often overlook this critical capability, perhaps due to the\nprevalence of modal annotations in most datasets. To address the scarcity of\namodal data, we introduce the TAO-Amodal benchmark, featuring 880 diverse\ncategories in thousands of video sequences. Our dataset includes amodal and\nmodal bounding boxes for visible and occluded objects, including objects that\nare partially out-of-frame. To enhance amodal tracking with object permanence,\nwe leverage a lightweight plug-in module, the amodal expander, to transform\nstandard, modal trackers into amodal ones through fine-tuning on a few hundred\nvideo sequences with data augmentation. We achieve a 3.3\% and 1.6\%\nimprovement on the detection and tracking of occluded objects on TAO-Amodal.\nWhen evaluated on people, our method produces dramatic improvements of 2x\ncompared to state-of-the-art modal baselines.\n","authors":["Cheng-Yen Hsieh","Tarasha Khurana","Achal Dave","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2312.12433v2.pdf","comment":"Project Page: https://tao-amodal.github.io"},{"id":"http://arxiv.org/abs/2401.12973v1","updated":"2024-01-23T18:59:21Z","published":"2024-01-23T18:59:21Z","title":"In-Context Language Learning: Architectures and Algorithms","summary":" Large-scale neural language models exhibit a remarkable capacity for\nin-context learning (ICL): they can infer novel functions from datasets\nprovided as input. Most of our current understanding of when and how ICL arises\ncomes from LMs trained on extremely simple learning problems like linear\nregression and associative recall. There remains a significant gap between\nthese model problems and the \"real\" ICL exhibited by LMs trained on large text\ncorpora, which involves not just retrieval and function approximation but\nfree-form generation of language and other structured outputs. In this paper,\nwe study ICL through the lens of a new family of model problems we term\nin-context language learning (ICLL). In ICLL, LMs are presented with a set of\nstrings from a formal language, and must generate additional strings from the\nsame language. We focus on in-context learning of regular languages generated\nby random finite automata.
We evaluate a diverse set of neural sequence models\n(including several RNNs, Transformers, and state-space model variants) on\nregular ICLL tasks, aiming to answer three questions: (1) Which model classes\nare empirically capable of ICLL? (2) What algorithmic solutions do successful\nmodels implement to perform ICLL? (3) What architectural changes can improve\nICLL in less performant models? We first show that Transformers significantly\noutperform neural sequence models with recurrent or convolutional\nrepresentations on ICLL tasks. Next, we provide evidence that their ability to\ndo so relies on specialized \"n-gram heads\" (higher-order variants of induction\nheads) that compute input-conditional next-token distributions. Finally, we\nshow that hard-wiring these heads into recurrent and convolutional models\nimproves performance not just on ICLL, but also on natural language modeling --\nimproving the perplexity of 340M-parameter models by up to 1.14 points (6.7%)\non the SlimPajama dataset.\n","authors":["Ekin Akyürek","Bailin Wang","Yoon Kim","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2401.12973v1.pdf","comment":"29 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12972v1","updated":"2024-01-23T18:58:35Z","published":"2024-01-23T18:58:35Z","title":"On the Efficacy of Text-Based Input Modalities for Action Anticipation","summary":" Although the task of anticipating future actions is highly uncertain,\ninformation from additional modalities helps to narrow down plausible action\nchoices. Each modality provides different environmental context for the model\nto learn from. While previous multi-modal methods leverage information from\nmodalities such as video and audio, we primarily explore how text inputs for\nactions and objects can also enable more accurate action anticipation.\nTherefore, we propose a Multi-modal Anticipative Transformer (MAT), an\nattention-based video transformer architecture that jointly learns from\nmulti-modal features and text captions. We train our model in two stages, where\nthe model first learns to predict actions in the video clip by aligning with\ncaptions, and during the second stage, we fine-tune the model to predict future\nactions. Compared to existing methods, MAT has the advantage of learning\nadditional environmental context from two kinds of text inputs: action\ndescriptions during the pre-training stage, and the text inputs for detected\nobjects and actions during modality feature fusion. Through extensive\nexperiments, we evaluate the effectiveness of the pre-training stage, and show\nthat our model outperforms previous methods on all datasets. In addition, we\nexamine the impact of object and action information obtained via text and\nperform extensive ablations. We evaluate the performance on three datasets:\nEpicKitchens-100, EpicKitchens-55 and EGTEA GAZE+, and show that text\ndescriptions do indeed aid in more effective action anticipation.\n","authors":["Apoorva Beedu","Karan Samel","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2401.12972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12963v1","updated":"2024-01-23T18:45:54Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks.
However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v1.pdf","comment":"26 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.12961v1","updated":"2024-01-23T18:45:27Z","published":"2024-01-23T18:45:27Z","title":"Chatterbox: Robust Transport for LLM Token Streaming under Unstable\n Network","summary":" To render each generated token in real time, the LLM server generates\nresponse tokens one by one and streams each generated token (or group of a few\ntokens) through the network to the user right after it is generated, which we\nrefer to as LLM token streaming. However, under unstable network conditions,\nthe LLM token streaming experience could suffer greatly from stalls since one\npacket loss could block the rendering of tokens contained in subsequent packets\neven if they arrive on time. With a real-world measurement study, we show that\ncurrent applications including ChatGPT, Claude, and Bard all suffer from\nincreased stall under unstable network.\n For this emerging token streaming problem in LLM Chatbots, we propose a novel\ntransport layer scheme, called Chatterbox, which puts new generated tokens as\nwell as currently unacknowledged tokens in the next outgoing packet. This\nensures that each packet contains some new tokens and can be independently\nrendered when received, thus avoiding aforementioned stalls caused by missing\npackets. Through simulation under various network conditions, we show\nChatterbox reduces stall ratio (proportion of token rendering wait time) by\n71.0% compared to the token streaming method commonly used by real chatbot\napplications and by 31.6% compared to a custom packet duplication scheme. 
By\ntailoring Chatterbox to fit the token-by-token generation of LLM, we enable the\nChatbots to respond like an eloquent speaker for users to better enjoy\npervasive AI.\n","authors":["Hanchen Li","Yuhan Liu","Yihua Cheng","Siddhant Ray","Kuntai Du","Junchen Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.12961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00775v2","updated":"2024-01-23T18:31:01Z","published":"2022-06-01T21:37:03Z","title":"Adaptive Local Neighborhood-based Neural Networks for MR Image\n Reconstruction from Undersampled Data","summary":" Recent medical image reconstruction techniques focus on generating\nhigh-quality medical images suitable for clinical use at the lowest possible\ncost and with the fewest possible adverse effects on patients. Recent works\nhave shown significant promise for reconstructing MR images from sparsely\nsampled k-space data using deep learning. In this work, we propose a technique\nthat rapidly estimates deep neural networks directly at reconstruction time by\nfitting them on small adaptively estimated neighborhoods of a training set. In\nbrief, our algorithm alternates between searching for neighbors in a data set\nthat are similar to the test reconstruction, and training a local network on\nthese neighbors followed by updating the test reconstruction. Because our\nreconstruction model is learned on a dataset that is in some sense similar to\nthe image being reconstructed rather than being fit on a large, diverse\ntraining set, it is more adaptive to new scans. It can also handle changes in\ntraining sets and flexible scan settings, while being relatively fast. Our\napproach, dubbed LONDN-MRI, was validated on multiple data sets using deep\nunrolled reconstruction networks. Reconstructions were performed at four fold\nand eight fold undersampling of k-space with 1D variable-density random\nphase-encode undersampling masks. Our results demonstrate that our proposed\nlocally-trained method produces higher-quality reconstructions compared to\nmodels trained globally on larger datasets as well as other scan-adaptive\nmethods.\n","authors":["Shijun Liang","Anish Lahiri","Saiprasad Ravishankar"],"pdf_url":"https://arxiv.org/pdf/2206.00775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15318v3","updated":"2024-01-23T18:27:30Z","published":"2023-10-23T19:35:57Z","title":"HetGPT: Harnessing the Power of Prompt Tuning in Pre-Trained\n Heterogeneous Graph Neural Networks","summary":" Graphs have emerged as a natural choice to represent and analyze the\nintricate patterns and rich information of the Web, enabling applications such\nas online page classification and social recommendation. The prevailing\n\"pre-train, fine-tune\" paradigm has been widely adopted in graph machine\nlearning tasks, particularly in scenarios with limited labeled nodes. However,\nthis approach often exhibits a misalignment between the training objectives of\npretext tasks and those of downstream tasks. This gap can result in the\n\"negative transfer\" problem, wherein the knowledge gained from pre-training\nadversely affects performance in the downstream tasks. The surge in\nprompt-based learning within Natural Language Processing (NLP) suggests the\npotential of adapting a \"pre-train, prompt\" paradigm to graphs as an\nalternative. However, existing graph prompting techniques are tailored to\nhomogeneous graphs, neglecting the inherent heterogeneity of Web graphs. 
To\nbridge this gap, we propose HetGPT, a general post-training prompting framework\nto improve the predictive performance of pre-trained heterogeneous graph neural\nnetworks (HGNNs). The key is the design of a novel prompting function that\nintegrates a virtual class prompt and a heterogeneous feature prompt, with the\naim to reformulate downstream tasks to mirror pretext tasks. Moreover, HetGPT\nintroduces a multi-view neighborhood aggregation mechanism, capturing the\ncomplex neighborhood structure in heterogeneous graphs. Extensive experiments\non three benchmark datasets demonstrate HetGPT's capability to enhance the\nperformance of state-of-the-art HGNNs on semi-supervised node classification.\n","authors":["Yihong Ma","Ning Yan","Jiayu Li","Masood Mortazavi","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2310.15318v3.pdf","comment":"Accepted to WWW 2024 as research paper"},{"id":"http://arxiv.org/abs/2401.12950v1","updated":"2024-01-23T18:15:58Z","published":"2024-01-23T18:15:58Z","title":"Bayesian Semi-structured Subspace Inference","summary":" Semi-structured regression models enable the joint modeling of interpretable\nstructured and complex unstructured feature effects. The structured model part\nis inspired by statistical models and can be used to infer the input-output\nrelationship for features of particular importance. The complex unstructured\npart defines an arbitrary deep neural network and thereby provides enough\nflexibility to achieve competitive prediction performance. While these models\ncan also account for aleatoric uncertainty, there is still a lack of work on\naccounting for epistemic uncertainty. In this paper, we address this problem by\npresenting a Bayesian approximation for semi-structured regression models using\nsubspace inference. To this end, we extend subspace inference for joint\nposterior sampling from a full parameter space for structured effects and a\nsubspace for unstructured effects. Apart from this hybrid sampling scheme, our\nmethod allows for tunable complexity of the subspace and can capture multiple\nminima in the loss landscape. Numerical experiments validate our approach's\nefficacy in recovering structured effect parameter posteriors in\nsemi-structured models and approaching the full-space posterior distribution of\nMCMC for increasing subspace dimension. Further, our approach exhibits\ncompetitive predictive performance across simulated and real-world datasets.\n","authors":["Daniel Dold","David Rügamer","Beate Sick","Oliver Dürr"],"pdf_url":"https://arxiv.org/pdf/2401.12950v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2309.10140v2","updated":"2024-01-23T18:08:34Z","published":"2023-09-18T20:39:12Z","title":"A Geometric Framework for Neural Feature Learning","summary":" We present a novel framework for learning system design based on neural\nfeature extractors. First, we introduce the feature geometry, which unifies\nstatistical dependence and features in the same function space with geometric\nstructures. By applying the feature geometry, we formulate each learning\nproblem as solving the optimal feature approximation of the dependence\ncomponent specified by the learning setting. We propose a nesting technique for\ndesigning learning algorithms to learn the optimal features from data samples,\nwhich can be applied to off-the-shelf network architectures and optimizers. 
To\ndemonstrate the applications of the nesting technique, we further discuss\nmultivariate learning problems, including conditioned inference and multimodal\nlearning, where we present the optimal features and reveal their connections to\nclassical approaches.\n","authors":["Xiangxiang Xu","Lizhong Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.10140v2.pdf","comment":"76 pages, 24 figures"},{"id":"http://arxiv.org/abs/2205.13743v5","updated":"2024-01-23T17:53:23Z","published":"2022-05-27T03:12:18Z","title":"Personalized Algorithmic Recourse with Preference Elicitation","summary":" Algorithmic Recourse (AR) is the problem of computing a sequence of actions\nthat -- once performed by a user -- overturns an undesirable machine decision.\nIt is paramount that the sequence of actions does not require too much effort\nfor users to implement. Yet, most approaches to AR assume that actions cost the\nsame for all users, and thus may recommend unfairly expensive recourse plans to\ncertain users. Prompted by this observation, we introduce PEAR, the first\nhuman-in-the-loop approach capable of providing personalized algorithmic\nrecourse tailored to the needs of any end-user. PEAR builds on insights from\nBayesian Preference Elicitation to iteratively refine an estimate of the costs\nof actions by asking choice set queries to the target user. The queries\nthemselves are computed by maximizing the Expected Utility of Selection, a\nprincipled measure of information gain accounting for uncertainty on both the\ncost estimate and the user's responses. PEAR integrates elicitation into a\nReinforcement Learning agent coupled with Monte Carlo Tree Search to quickly\nidentify promising recourse plans. Our empirical evaluation on real-world\ndatasets highlights how PEAR produces high-quality personalized recourse in\nonly a handful of iterations.\n","authors":["Giovanni De Toni","Paolo Viappiani","Stefano Teso","Bruno Lepri","Andrea Passerini"],"pdf_url":"https://arxiv.org/pdf/2205.13743v5.pdf","comment":"Published in Transactions in Machine Learning Research (TMLR),\n January 2024. See https://openreview.net/forum?id=8sg2I9zXgO for the official\n submission"},{"id":"http://arxiv.org/abs/2401.11488v2","updated":"2024-01-23T17:49:42Z","published":"2024-01-21T13:24:41Z","title":"HARDCORE: H-field and power loss estimation for arbitrary waveforms with\n residual, dilated convolutional neural networks in ferrite cores","summary":" The MagNet Challenge 2023 calls upon competitors to develop data-driven\nmodels for the material-specific, waveform-agnostic estimation of steady-state\npower losses in toroidal ferrite cores. The following HARDCORE (H-field and\npower loss estimation for Arbitrary waveforms with Residual, Dilated\nconvolutional neural networks in ferrite COREs) approach shows that a residual\nconvolutional neural network with physics-informed extensions can serve this\ntask efficiently when trained on observational data beforehand. One key\nsolution element is an intermediate model layer which first reconstructs the bh\ncurve and then estimates the power losses based on the curve's area rendering\nthe proposed topology physically interpretable. In addition, emphasis was\nplaced on expert-based feature engineering and information-rich inputs in order\nto enable a lean model architecture. A model is trained from scratch for each\nmaterial, while the topology remains the same. 
A Pareto-style trade-off between\nmodel size and estimation accuracy is demonstrated, which yields an optimum at\nas low as 1755 parameters and down to below 8\\,\\% for the 95-th percentile of\nthe relative error for the worst-case material with sufficient samples.\n","authors":["Wilhelm Kirchgässner","Nikolas Förster","Till Piepenbrock","Oliver Schweins","Oliver Wallscheid"],"pdf_url":"https://arxiv.org/pdf/2401.11488v2.pdf","comment":"Competition submission version, slightly change author order"},{"id":"http://arxiv.org/abs/2401.12934v1","updated":"2024-01-23T17:42:17Z","published":"2024-01-23T17:42:17Z","title":"Reward-Relevance-Filtered Linear Offline Reinforcement Learning","summary":" This paper studies offline reinforcement learning with linear function\napproximation in a setting with decision-theoretic, but not estimation\nsparsity. The structural restrictions of the data-generating process presume\nthat the transitions factor into a sparse component that affects the reward and\ncould affect additional exogenous dynamics that do not affect the reward.\nAlthough the minimally sufficient adjustment set for estimation of full-state\ntransition properties depends on the whole state, the optimal policy and\ntherefore state-action value function depends only on the sparse component: we\ncall this causal/decision-theoretic sparsity. We develop a method for\nreward-filtering the estimation of the state-action value function to the\nsparse component by a modification of thresholded lasso in least-squares policy\nevaluation. We provide theoretical guarantees for our reward-filtered linear\nfitted-Q-iteration, with sample complexity depending only on the size of the\nsparse component.\n","authors":["Angela Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.12934v1.pdf","comment":"conference version accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2401.12930v1","updated":"2024-01-23T17:33:41Z","published":"2024-01-23T17:33:41Z","title":"pyAKI - An Open Source Solution to Automated KDIGO classification","summary":" Acute Kidney Injury (AKI) is a frequent complication in critically ill\npatients, affecting up to 50% of patients in the intensive care units. The lack\nof standardized and open-source tools for applying the Kidney Disease Improving\nGlobal Outcomes (KDIGO) criteria to time series data has a negative impact on\nworkload and study quality. This project introduces pyAKI, an open-source\npipeline addressing this gap by providing a comprehensive solution for\nconsistent KDIGO criteria implementation.\n The pyAKI pipeline was developed and validated using a subset of the Medical\nInformation Mart for Intensive Care (MIMIC)-IV database, a commonly used\ndatabase in critical care research. We defined a standardized data model in\norder to ensure reproducibility. 
Validation against expert annotations\ndemonstrated pyAKI's robust performance in implementing KDIGO criteria.\nComparative analysis revealed its ability to surpass the quality of human\nlabels.\n This work introduces pyAKI as an open-source solution for implementing the\nKDIGO criteria for AKI diagnosis using time series data with high accuracy and\nperformance.\n","authors":["Christian Porschen","Jan Ernsting","Paul Brauckmann","Raphael Weiss","Till Würdemann","Hendrik Booke","Wida Amini","Ludwig Maidowski","Benjamin Risse","Tim Hahn","Thilo von Groote"],"pdf_url":"https://arxiv.org/pdf/2401.12930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13209v2","updated":"2024-01-23T17:31:48Z","published":"2022-04-27T21:58:07Z","title":"Robust stabilization of polytopic systems via fast and reliable neural\n network-based approximations","summary":" We consider the design of fast and reliable neural network (NN)-based\napproximations of traditional stabilizing controllers for linear systems with\npolytopic uncertainty, including control laws with variable structure and those\nbased on a (minimal) selection policy. Building upon recent approaches for the\ndesign of reliable control surrogates with guaranteed structural properties, we\ndevelop a systematic procedure to certify the closed-loop stability and\nperformance of a linear uncertain system when a trained rectified linear unit\n(ReLU)-based approximation replaces such traditional controllers. First, we\nprovide a sufficient condition, which involves the worst-case approximation\nerror between ReLU-based and traditional controller-based state-to-input\nmappings, ensuring that the system is ultimately bounded within a set with\nadjustable size and convergence rate. Then, we develop an offline,\nmixed-integer optimization-based method that allows us to compute that quantity\nexactly.\n","authors":["Filippo Fabiani","Paul J. Goulart"],"pdf_url":"https://arxiv.org/pdf/2204.13209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03131v2","updated":"2024-01-23T17:29:54Z","published":"2023-11-06T14:28:11Z","title":"Reservoir-Computing Model for Mapping and Forecasting Neuronal\n Interactions from Electrophysiological Data","summary":" The electrophysiological nature of neuronal networks makes it possible to reveal various\ninteractions between different cell units at very short time-scales. One of\nthe many challenges in analyzing these signals is to retrieve the morphology\nand functionality of a given network. In this work, we developed a computational\nmodel, based on the Reservoir Computing Network (RCN) architecture, which decodes\nthe spatio-temporal data from electro-physiological measurements of neuronal\ncultures and reconstructs the network structure on a macroscopic domain,\nrepresenting the connectivity between neuronal units. We demonstrate that the\nmodel can predict the connectivity map of the network with higher accuracy than\ncommon methods such as Cross-Correlation and Transfer-Entropy.
In addition,\nwe experimentally demonstrate the ability of the model to predict a network\nresponse to a specific input, such as localized stimulus.\n","authors":["Ilya Auslender","Giorgio Letti","Yasaman Heydari","Clara Zaccaria","Lorenzo Pavesi"],"pdf_url":"https://arxiv.org/pdf/2311.03131v2.pdf","comment":"Pre-submission draft"},{"id":"http://arxiv.org/abs/2205.05587v3","updated":"2024-01-23T17:26:09Z","published":"2022-05-11T16:00:14Z","title":"Choice of training label matters: how to best use deep learning for\n quantitative MRI parameter estimation","summary":" Deep learning (DL) is gaining popularity as a parameter estimation method for\nquantitative MRI. A range of competing implementations have been proposed,\nrelying on either supervised or self-supervised learning. Self-supervised\napproaches, sometimes referred to as unsupervised, have been loosely based on\nauto-encoders, whereas supervised methods have, to date, been trained on\ngroundtruth labels. These two learning paradigms have been shown to have\ndistinct strengths. Notably, self-supervised approaches have offered lower-bias\nparameter estimates than their supervised alternatives. This result is\ncounterintuitive - incorporating prior knowledge with supervised labels should,\nin theory, lead to improved accuracy. In this work, we show that this apparent\nlimitation of supervised approaches stems from the naive choice of groundtruth\ntraining labels. By training on labels which are deliberately not groundtruth,\nwe show that the low-bias parameter estimation previously associated with\nself-supervised methods can be replicated - and improved on - within a\nsupervised learning framework. This approach sets the stage for a single,\nunifying, deep learning parameter estimation framework, based on supervised\nlearning, where trade-offs between bias and variance are made by careful\nadjustment of training label.\n","authors":["Sean C. Epstein","Timothy J. P. Bray","Margaret Hall-Craggs","Hui Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.05587v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:002"},{"id":"http://arxiv.org/abs/2401.12926v1","updated":"2024-01-23T17:22:00Z","published":"2024-01-23T17:22:00Z","title":"DsDm: Model-Aware Dataset Selection with Datamodels","summary":" When selecting data for training large-scale models, standard practice is to\nfilter for examples that match human notions of data quality. Such filtering\nyields qualitatively clean datapoints that intuitively should improve model\nbehavior. However, in practice the opposite can often happen: we find that\nselecting according to similarity with \"high quality\" data sources may not\nincrease (and can even hurt) performance compared to randomly selecting data.\n To develop better methods for selecting data, we start by framing dataset\nselection as an optimization problem that we can directly solve for: given\ntarget tasks, a learning algorithm, and candidate data, select the subset that\nmaximizes model performance. This framework thus avoids handpicked notions of\ndata quality, and instead models explicitly how the learning process uses train\ndatapoints to predict on the target tasks. Our resulting method greatly\nimproves language model (LM) performance on both pre-specified tasks and\npreviously unseen tasks. 
Specifically, choosing target tasks representative of\nstandard LM problems and evaluating on diverse held-out benchmarks, our\nselected datasets provide a 2x compute multiplier over baseline methods.\n","authors":["Logan Engstrom","Axel Feldmann","Aleksander Madry"],"pdf_url":"https://arxiv.org/pdf/2401.12926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12924v1","updated":"2024-01-23T17:20:52Z","published":"2024-01-23T17:20:52Z","title":"Performance Analysis of Support Vector Machine (SVM) on Challenging\n Datasets for Forest Fire Detection","summary":" This article delves into the analysis of performance and utilization of\nSupport Vector Machines (SVMs) for the critical task of forest fire detection\nusing image datasets. With the increasing threat of forest fires to ecosystems\nand human settlements, the need for rapid and accurate detection systems is of\nutmost importance. SVMs, renowned for their strong classification capabilities,\nexhibit proficiency in recognizing patterns associated with fire within images.\nBy training on labeled data, SVMs acquire the ability to identify distinctive\nattributes associated with fire, such as flames, smoke, or alterations in the\nvisual characteristics of the forest area. The document thoroughly examines the\nuse of SVMs, covering crucial elements like data preprocessing, feature\nextraction, and model training. It rigorously evaluates parameters such as\naccuracy, efficiency, and practical applicability. The knowledge gained from\nthis study aids in the development of efficient forest fire detection systems,\nenabling prompt responses and improving disaster management. Moreover, the\ncorrelation between SVM accuracy and the difficulties presented by\nhigh-dimensional datasets is carefully investigated, demonstrated through a\nrevealing case study. The relationship between accuracy scores and the\ndifferent resolutions used for resizing the training datasets has also been\ndiscussed in this article. These comprehensive studies result in a definitive\noverview of the difficulties faced and the potential sectors requiring further\nimprovement and focus.\n","authors":["Ankan Kar","Nirjhar Nath","Utpalraj Kemprai"," Aman"],"pdf_url":"https://arxiv.org/pdf/2401.12924v1.pdf","comment":"19 pages, 8 figures, accepted in IJCNS of SCIRP (not yet published)"},{"id":"http://arxiv.org/abs/2401.12923v1","updated":"2024-01-23T17:20:48Z","published":"2024-01-23T17:20:48Z","title":"Deep multitask neural networks for solving some stochastic optimal\n control problems","summary":" Most existing neural network-based approaches for solving stochastic optimal\ncontrol problems using the associated backward dynamic programming principle\nrely on the ability to simulate the underlying state variables. However, in\nsome problems, this simulation is infeasible, leading to the discretization of\nstate variable space and the need to train one neural network for each data\npoint. This approach becomes computationally inefficient when dealing with\nlarge state variable spaces. In this paper, we consider a class of this type of\nstochastic optimal control problems and introduce an effective solution\nemploying multitask neural networks. 
To train our multitask neural network, we\nintroduce a novel scheme that dynamically balances the learning across tasks.\nThrough numerical experiments on real-world derivatives pricing problems, we\nprove that our method outperforms state-of-the-art approaches.\n","authors":["Christian Yeo"],"pdf_url":"https://arxiv.org/pdf/2401.12923v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2209.07805v4","updated":"2024-01-23T17:14:20Z","published":"2022-09-16T09:09:15Z","title":"A Comprehensive Benchmark for COVID-19 Predictive Modeling Using\n Electronic Health Records in Intensive Care","summary":" The COVID-19 pandemic has posed a heavy burden to the healthcare system\nworldwide and caused huge social disruption and economic loss. Many deep\nlearning models have been proposed to conduct clinical predictive tasks such as\nmortality prediction for COVID-19 patients in intensive care units using\nElectronic Health Record (EHR) data. Despite their initial success in certain\nclinical applications, there is currently a lack of benchmarking results to\nachieve a fair comparison so that we can select the optimal model for clinical\nuse. Furthermore, there is a discrepancy between the formulation of traditional\nprediction tasks and real-world clinical practice in intensive care. To fill\nthese gaps, we propose two clinical prediction tasks, Outcome-specific\nlength-of-stay prediction and Early mortality prediction for COVID-19 patients\nin intensive care units. The two tasks are adapted from the naive\nlength-of-stay and mortality prediction tasks to accommodate the clinical\npractice for COVID-19 patients. We propose fair, detailed, open-source\ndata-preprocessing pipelines and evaluate 17 state-of-the-art predictive models\non two tasks, including 5 machine learning models, 6 basic deep learning models\nand 6 deep learning predictive models specifically designed for EHR data. We\nprovide benchmarking results using data from two real-world COVID-19 EHR\ndatasets. One dataset is publicly available without needing any inquiry and\nanother dataset can be accessed on request. We provide fair, reproducible\nbenchmarking results for two tasks. We deploy all experiment results and models\non an online platform. We also allow clinicians and researchers to upload their\ndata to the platform and get quick prediction results using our trained models.\nWe hope our efforts can further facilitate deep learning and machine learning\nresearch for COVID-19 predictive modeling.\n","authors":["Junyi Gao","Yinghao Zhu","Wenqing Wang","Yasha Wang","Wen Tang","Ewen M. Harrison","Liantao Ma"],"pdf_url":"https://arxiv.org/pdf/2209.07805v4.pdf","comment":"Junyi Gao, Yinghao Zhu and Wenqing Wang contributed equally"},{"id":"http://arxiv.org/abs/2106.01135v4","updated":"2024-01-23T16:52:16Z","published":"2021-06-02T13:05:34Z","title":"MNL-Bandit with Knapsacks: a near-optimal algorithm","summary":" We consider a dynamic assortment selection problem where a seller has a fixed\ninventory of $N$ substitutable products and faces an unknown demand that\narrives sequentially over $T$ periods. In each period, the seller needs to\ndecide on the assortment of products (satisfying certain constraints) to offer\nto the customers. The customer's response follows an unknown multinomial logit\nmodel (MNL) with parameter $\\boldsymbol{v}$. If customer selects product $i \\in\n[N]$, the seller receives revenue $r_i$. 
The goal of the seller is to maximize\nthe total expected revenue from the $T$ customers given the fixed initial\ninventory of $N$ products. We present MNLwK-UCB, a UCB-based algorithm, and\ncharacterize its regret under different regimes of inventory size. We show that\nwhen the inventory size grows quasi-linearly in time, MNLwK-UCB achieves a\n$\tilde{O}(N + \sqrt{NT})$ regret bound. We also show that for a smaller\ninventory (with growth $\sim T^{\alpha}$, $\alpha < 1$), MNLwK-UCB achieves a\n$\tilde{O}(N(1 + T^{\frac{1 - \alpha}{2}}) + \sqrt{NT})$ regret bound. In particular, over a\nlong time horizon $T$, the rate $\tilde{O}(\sqrt{NT})$ is always achieved\nregardless of the constraints and the size of the inventory.\n","authors":["Abdellah Aznag","Vineet Goyal","Noemie Perivier"],"pdf_url":"https://arxiv.org/pdf/2106.01135v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03311v3","updated":"2024-01-23T16:34:53Z","published":"2023-12-06T06:33:25Z","title":"On the Nystrom Approximation for Preconditioning in Kernel Machines","summary":" Kernel methods are a popular class of nonlinear predictive models in machine\nlearning. Scalable algorithms for learning kernel models need to be iterative\nin nature, but convergence can be slow due to poor conditioning. Spectral\npreconditioning is an important tool to speed up the convergence of such\niterative algorithms for training kernel models. However, computing and storing\na spectral preconditioner can be expensive, which can lead to large\ncomputational and storage overheads, precluding the application of kernel\nmethods to problems with large datasets. A Nystrom approximation of the\nspectral preconditioner is often cheaper to compute and store, and has\ndemonstrated success in practical applications. In this paper we analyze the\ntrade-offs of using such an approximated preconditioner. Specifically, we show\nthat a sample of logarithmic size (as a function of the size of the dataset)\nenables the Nystrom-based approximated preconditioner to accelerate gradient\ndescent nearly as well as the exact preconditioner, while also reducing the\ncomputational and storage overheads.\n","authors":["Amirhesam Abedsoltan","Parthe Pandit","Luis Rademacher","Mikhail Belkin"],"pdf_url":"https://arxiv.org/pdf/2312.03311v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12882v1","updated":"2024-01-23T16:22:50Z","published":"2024-01-23T16:22:50Z","title":"Model-Free $δ$-Policy Iteration Based on Damped Newton Method for\n Nonlinear Continuous-Time H$\infty$ Tracking Control","summary":" This paper presents a {\delta}-PI algorithm, based on the damped Newton\nmethod, for the H{\infty} tracking control problem of an unknown continuous-time\nnonlinear system. A discounted performance function and an augmented system are\nused to get the tracking Hamilton-Jacobi-Isaac (HJI) equation. The tracking HJI\nequation is a nonlinear partial differential equation; traditional\nreinforcement learning methods for solving the tracking HJI equation are mostly\nbased on the Newton method, which usually only satisfies local convergence and\nneeds a good initial guess. Based upon the damped Newton iteration operator\nequation, a generalized tracking Bellman equation is first derived. The\n{\delta}-PI algorithm can seek the optimal solution of the tracking HJI\nequation by iteratively solving the generalized tracking Bellman equation.\nOn-policy learning and off-policy learning {\delta}-PI reinforcement learning\nmethods are provided, respectively.
The off-policy version of the {\delta}-PI algorithm is\na model-free algorithm which can be performed without making use of a priori\nknowledge of the system dynamics. An NN-based implementation scheme for the\noff-policy {\delta}-PI algorithm is shown. The suitability of the model-free\n{\delta}-PI algorithm is illustrated with a nonlinear system simulation.\n","authors":["Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12882v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2303.07846v2","updated":"2024-01-23T16:14:46Z","published":"2023-03-14T12:36:01Z","title":"Sample-efficient Adversarial Imitation Learning","summary":" Imitation learning, in which learning is performed by demonstration, has been\nstudied and advanced for sequential decision-making tasks in which a reward\nfunction is not predefined. However, imitation learning methods still require\nnumerous expert demonstration samples to successfully imitate an expert's\nbehavior. To improve sample efficiency, we utilize self-supervised\nrepresentation learning, which can generate vast training signals from the\ngiven data. In this study, we propose a self-supervised representation-based\nadversarial imitation learning method to learn state and action representations\nthat are robust to diverse distortions and temporally predictive, on non-image\ncontrol tasks. In particular, in comparison with existing self-supervised\nlearning methods for tabular data, we propose a different corruption method for\nstate and action representations that is robust to diverse distortions. We\ntheoretically and empirically observe that making an informative feature\nmanifold with less sample complexity significantly improves the performance of\nimitation learning. The proposed method shows a 39% relative improvement over\nexisting adversarial imitation learning methods on MuJoCo in a setting limited\nto 100 expert state-action pairs. Moreover, we conduct comprehensive ablations\nand additional experiments using demonstrations with varying optimality to\nprovide insights into a range of factors.\n","authors":["Dahuin Jung","Hyungyu Lee","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2303.07846v2.pdf","comment":"Published at JMLR (Journal of Machine Learning Research). A\n preliminary version of this manuscript was presented at Deep RL Workshop,\n NeurIPS 2022"},{"id":"http://arxiv.org/abs/2307.02764v2","updated":"2024-01-23T16:01:02Z","published":"2023-07-06T04:13:57Z","title":"When Does Confidence-Based Cascade Deferral Suffice?","summary":" Cascades are a classical strategy to enable inference cost to vary adaptively\nacross samples, wherein a sequence of classifiers are invoked in turn. A\ndeferral rule determines whether to invoke the next classifier in the sequence,\nor to terminate prediction. One simple deferral rule employs the confidence of\nthe current classifier, e.g., based on the maximum predicted softmax\nprobability. Despite being oblivious to the structure of the cascade -- e.g.,\nnot modelling the errors of downstream models -- such confidence-based deferral\noften works remarkably well in practice. In this paper, we seek to better\nunderstand the conditions under which confidence-based deferral may fail, and\nwhen alternate deferral strategies can perform better. We first present a\ntheoretical characterisation of the optimal deferral rule, which precisely\ncharacterises settings under which confidence-based deferral may suffer.
We\nthen study post-hoc deferral mechanisms, and demonstrate they can significantly\nimprove upon confidence-based deferral in settings where (i) downstream models\nare specialists that only work well on a subset of inputs, (ii) samples are\nsubject to label noise, and (iii) there is distribution shift between the train\nand test set.\n","authors":["Wittawat Jitkrittum","Neha Gupta","Aditya Krishna Menon","Harikrishna Narasimhan","Ankit Singh Rawat","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.02764v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.12866v1","updated":"2024-01-23T16:00:45Z","published":"2024-01-23T16:00:45Z","title":"Evaluating Collaborative and Autonomous Agents in Data-Stream-Supported\n Coordination of Mobile Crowdsourcing","summary":" Mobile crowdsourcing refers to systems where the completion of tasks\nnecessarily requires physical movement of crowdworkers in an on-demand\nworkforce. Evidence suggests that in such systems, tasks often get assigned to\ncrowdworkers who struggle to complete those tasks successfully, resulting in\nhigh failure rates and low service quality. A promising solution to ensure\nhigher quality of service is to continuously adapt the assignment and respond\nto failure-causing events by transferring tasks to better-suited workers who\nuse different routes or vehicles. However, implementing task transfers in\nmobile crowdsourcing is difficult because workers are autonomous and may reject\ntransfer requests. Moreover, task outcomes are uncertain and need to be\npredicted. In this paper, we propose different mechanisms to achieve outcome\nprediction and task coordination in mobile crowdsourcing. First, we analyze\ndifferent data stream learning approaches for the prediction of task outcomes.\nSecond, based on the suggested prediction model, we propose and evaluate two\ndifferent approaches for task coordination with different degrees of autonomy:\nan opportunistic approach for crowdshipping with collaborative, but\nnon-autonomous workers, and a market-based model with autonomous workers for\ncrowdsensing.\n","authors":["Ralf Bruns","Jeremias Dötterl","Jürgen Dunkel","Sascha Ossowski"],"pdf_url":"https://arxiv.org/pdf/2401.12866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14391v4","updated":"2024-01-23T15:52:28Z","published":"2023-04-27T17:55:13Z","title":"Energy-based Models are Zero-Shot Planners for Compositional Scene\n Rearrangement","summary":" Language is compositional; an instruction can express multiple relation\nconstraints to hold among objects in a scene that a robot is tasked to\nrearrange. Our focus in this work is an instructable scene-rearranging\nframework that generalizes to longer instructions and to spatial concept\ncompositions never seen at training time. We propose to represent\nlanguage-instructed spatial concepts with energy functions over relative object\narrangements. A language parser maps instructions to corresponding energy\nfunctions and an open-vocabulary visual-language model grounds their arguments\nto relevant objects in the scene. We generate goal scene configurations by\ngradient descent on the sum of energy functions, one per language predicate in\nthe instruction. Local vision-based policies then re-locate objects to the\ninferred goal locations. We test our model on established instruction-guided\nmanipulation benchmarks, as well as benchmarks of compositional instructions we\nintroduce. 
We show our model can execute highly compositional instructions\nzero-shot in simulation and in the real world. It outperforms\nlanguage-to-action reactive policies and Large Language Model planners by a\nlarge margin, especially for long instructions that involve compositions of\nmultiple spatial concepts. Simulation and real-world robot execution videos, as\nwell as our code and datasets are publicly available on our website:\nhttps://ebmplanner.github.io.\n","authors":["Nikolaos Gkanatsios","Ayush Jain","Zhou Xian","Yunchu Zhang","Christopher Atkeson","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2304.14391v4.pdf","comment":"First two authors contributed equally | RSS 2023"},{"id":"http://arxiv.org/abs/2401.12851v1","updated":"2024-01-23T15:35:50Z","published":"2024-01-23T15:35:50Z","title":"Classification of grapevine varieties using UAV hyperspectral imaging","summary":" The classification of different grapevine varieties is a relevant phenotyping\ntask in Precision Viticulture since it enables estimating the growth of\nvineyard rows dedicated to different varieties, among other applications\nconcerning the wine industry. This task can be performed with destructive\nmethods that require time-consuming tasks, including data collection and\nanalysis in the laboratory. However, Unmanned Aerial Vehicles (UAV) provide a\nmore efficient and less prohibitive approach to collecting hyperspectral data,\ndespite acquiring noisier data. Therefore, the first task is the processing of\nthese data to correct and downsample large amounts of data. In addition, the\nhyperspectral signatures of grape varieties are very similar. In this work, a\nConvolutional Neural Network (CNN) is proposed for classifying seventeen\nvarieties of red and white grape variants. Rather than classifying single\nsamples, these are processed together with their neighbourhood. Hence, the\nextraction of spatial and spectral features is addressed with 1) a spatial\nattention layer and 2) Inception blocks. The pipeline goes from processing to\ndataset elaboration, finishing with the training phase. The fitted model is\nevaluated in terms of response time, accuracy and data separability, and\ncompared with other state-of-the-art CNNs for classifying hyperspectral data.\nOur network was proven to be much more lightweight with a reduced number of\ninput bands, a lower number of trainable weights and therefore, reduced\ntraining time. Despite this, the evaluated metrics showed much better results\nfor our network (~99% overall accuracy), in comparison with previous works\nbarely achieving 81% OA.\n","authors":["Alfonso López","Carlos Javier Ogayar","Francisco Ramón Feito","Joaquim João Sousa"],"pdf_url":"https://arxiv.org/pdf/2401.12851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12849v1","updated":"2024-01-23T15:33:30Z","published":"2024-01-23T15:33:30Z","title":"Learning safety critics via a non-contractive binary bellman operator","summary":" The inability to naturally enforce safety in Reinforcement Learning (RL),\nwith limited failures, is a core challenge impeding its use in real-world\napplications. One notion of safety of vast practical relevance is the ability\nto avoid (unsafe) regions of the state space. Though such a safety goal can be\ncaptured by an action-value-like function, a.k.a. safety critics, the\nassociated operator lacks the desired contraction and uniqueness properties\nthat the classical Bellman operator enjoys. 
In this work, we overcome the\nnon-contractiveness of safety critic operators by leveraging that safety is a\nbinary property. To that end, we study the properties of the binary safety\ncritic associated with a deterministic dynamical system that seeks to avoid\nreaching an unsafe region. We formulate the corresponding binary Bellman\nequation (B2E) for safety and study its properties. While the resulting\noperator is still non-contractive, we fully characterize its fixed points\nrepresenting--except for a spurious solution--maximal persistently safe regions\nof the state space that can always avoid failure. We provide an algorithm that,\nby design, leverages axiomatic knowledge of safe data to avoid spurious fixed\npoints.\n","authors":["Agustin Castellano","Hancheng Min","Juan Andrés Bazerque","Enrique Mallada"],"pdf_url":"https://arxiv.org/pdf/2401.12849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18382v2","updated":"2024-01-23T15:27:21Z","published":"2023-10-27T02:58:11Z","title":"From Generative AI to Generative Internet of Things: Fundamentals,\n Framework, and Outlooks","summary":" Generative Artificial Intelligence (GAI) possesses the capabilities of\ngenerating realistic data and facilitating advanced decision-making. By\nintegrating GAI into modern Internet of Things (IoT), Generative Internet of\nThings (GIoT) is emerging and holds immense potential to revolutionize various\naspects of society, enabling more efficient and intelligent IoT applications,\nsuch as smart surveillance and voice assistants. In this article, we present\nthe concept of GIoT and conduct an exploration of its potential prospects.\nSpecifically, we first overview four GAI techniques and investigate promising\nGIoT applications. Then, we elaborate on the main challenges in enabling GIoT\nand propose a general GAI-based secure incentive mechanism framework to address\nthem, in which we adopt Generative Diffusion Models (GDMs) for incentive\nmechanism designs and apply blockchain technologies for secure GIoT management.\nMoreover, we conduct a case study on modern Internet of Vehicle traffic\nmonitoring, which utilizes GDMs to generate effective contracts for\nincentivizing users to contribute sensing data with high quality. Finally, we\nsuggest several open directions worth investigating for the future popularity\nof GIoT.\n","authors":["Jinbo Wen","Jiangtian Nie","Jiawen Kang","Dusit Niyato","Hongyang Du","Yang Zhang","Mohsen Guizani"],"pdf_url":"https://arxiv.org/pdf/2310.18382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12843v1","updated":"2024-01-23T15:25:21Z","published":"2024-01-23T15:25:21Z","title":"An embedding-based distance for temporal graphs","summary":" We define a distance between temporal graphs based on graph embeddings built\nusing time-respecting random walks. We study both the case of matched graphs,\nwhen there exists a known relation between the nodes, and the unmatched case,\nwhen such a relation is unavailable and the graphs may be of different sizes.\nWe illustrate the interest of our distance definition, using both real and\nsynthetic temporal network data, by showing its ability to discriminate between\ngraphs with different structural and temporal properties. 
Leveraging\nstate-of-the-art machine learning techniques, we propose an efficient\nimplementation of distance computation that is viable for large-scale temporal\ngraphs.\n","authors":["Lorenzo Dall'Amico","Alain Barrat","Ciro Cattuto"],"pdf_url":"https://arxiv.org/pdf/2401.12843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12842v1","updated":"2024-01-23T15:23:13Z","published":"2024-01-23T15:23:13Z","title":"Iterated Relevance Matrix Analysis (IRMA) for the identification of\n class-discriminative subspaces","summary":" We introduce and investigate the iterated application of Generalized Matrix\nLearning Vector Quantization for the analysis of feature relevances in\nclassification problems, as well as for the construction of\nclass-discriminative subspaces. The suggested Iterated Relevance Matrix\nAnalysis (IRMA) identifies a linear subspace representing the classification-specific\ninformation of the considered data sets using Generalized Matrix\nLearning Vector Quantization (GMLVQ). By iteratively determining a new\ndiscriminative subspace while projecting out all previously identified ones, a\ncombined subspace carrying all class-specific information can be found. This\nfacilitates a detailed analysis of feature relevances, and enables improved\nlow-dimensional representations and visualizations of labeled data sets.\nAdditionally, the IRMA-based class-discriminative subspace can be used for\ndimensionality reduction and the training of robust classifiers with\npotentially improved performance.\n","authors":["Sofie Lövdal","Michael Biehl"],"pdf_url":"https://arxiv.org/pdf/2401.12842v1.pdf","comment":"17 pages, 5 figures, 1 table. Submitted to Neurocomputing. Extension\n of 2023 ESANN conference contribution"},{"id":"http://arxiv.org/abs/2401.11202v2","updated":"2024-01-23T15:11:46Z","published":"2024-01-20T10:30:31Z","title":"PartIR: Composing SPMD Partitioning Strategies for Machine Learning","summary":" Training of modern large neural networks (NN) requires a combination of\nparallelization strategies encompassing data, model, or optimizer sharding.\nWhen strategies increase in complexity, it becomes necessary for partitioning\ntools to be 1) expressive, allowing the composition of simpler strategies, and\n2) predictable to estimate performance analytically. We present PartIR, our\ndesign for a NN partitioning system. PartIR is focused on an incremental\napproach to rewriting and is hardware-and-runtime agnostic. We present a simple\nbut powerful API for composing sharding strategies and a simulator to validate\nthem. The process is driven by high-level programmer-issued partitioning\ntactics, which can be both manual and automatic. Importantly, the tactics are\nspecified separately from the model code, making them easy to change. We\nevaluate PartIR on several different models to demonstrate its predictability,\nexpressibility, and ability to reach peak performance.\n","authors":["Sami Alabed","Bart Chrzaszcz","Juliana Franco","Dominik Grewe","Dougal Maclaurin","James Molloy","Tom Natan","Tamara Norman","Xiaoyue Pan","Adam Paszke","Norman A. 
Rink","Michael Schaarschmidt","Timur Sitdikov","Agnieszka Swietlik","Dimitrios Vytiniotis","Joel Wee"],"pdf_url":"https://arxiv.org/pdf/2401.11202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12830v1","updated":"2024-01-23T15:07:49Z","published":"2024-01-23T15:07:49Z","title":"Enhancing Next Destination Prediction: A Novel LSTM Approach Using\n Real-World Airline Data","summary":" In the modern transportation industry, accurate prediction of travelers' next\ndestinations brings multiple benefits to companies, such as customer\nsatisfaction and targeted marketing. This study focuses on developing a precise\nmodel that captures the sequential patterns and dependencies in travel data,\nenabling accurate predictions of individual travelers' future destinations. To\nachieve this, a novel model architecture with a sliding window approach based\non Long Short-Term Memory (LSTM) is proposed for destination prediction in the\ntransportation industry. The experimental results highlight satisfactory\nperformance and high scores achieved by the proposed model across different\ndata sizes and performance metrics. This research contributes to advancing\ndestination prediction methods, empowering companies to deliver personalized\nrecommendations and optimize customer experiences in the dynamic travel\nlandscape.\n","authors":["Salih Salihoglu","Gulser Koksal","Orhan Abar"],"pdf_url":"https://arxiv.org/pdf/2401.12830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12824v1","updated":"2024-01-23T14:59:46Z","published":"2024-01-23T14:59:46Z","title":"MAPPING: Debiasing Graph Neural Networks for Fair Node Classification\n with Limited Sensitive Information Leakage","summary":" Despite remarkable success in diverse web-based applications, Graph Neural\nNetworks(GNNs) inherit and further exacerbate historical discrimination and\nsocial stereotypes, which critically hinder their deployments in high-stake\ndomains such as online clinical diagnosis, financial crediting, etc. However,\ncurrent fairness research that primarily craft on i.i.d data, cannot be\ntrivially replicated to non-i.i.d. graph structures with topological dependence\namong samples. Existing fair graph learning typically favors pairwise\nconstraints to achieve fairness but fails to cast off dimensional limitations\nand generalize them into multiple sensitive attributes; besides, most studies\nfocus on in-processing techniques to enforce and calibrate fairness,\nconstructing a model-agnostic debiasing GNN framework at the pre-processing\nstage to prevent downstream misuses and improve training reliability is still\nlargely under-explored. Furthermore, previous work on GNNs tend to enhance\neither fairness or privacy individually but few probe into their interplays. In\nthis paper, we propose a novel model-agnostic debiasing framework named MAPPING\n(\\underline{M}asking \\underline{A}nd \\underline{P}runing and\nMessage-\\underline{P}assing train\\underline{ING}) for fair node classification,\nin which we adopt the distance covariance($dCov$)-based fairness constraints to\nsimultaneously reduce feature and topology biases in arbitrary dimensions, and\ncombine them with adversarial debiasing to confine the risks of attribute\ninference attacks. Experiments on real-world datasets with different GNN\nvariants demonstrate the effectiveness and flexibility of MAPPING. 
Our results\nshow that MAPPING can achieve better trade-offs between utility and fairness,\nand mitigate privacy risks of sensitive information leakage.\n","authors":["Ying Song","Balaji Palanisamy"],"pdf_url":"https://arxiv.org/pdf/2401.12824v1.pdf","comment":"Finished May last year. Remember to submit all papers to arXiv early\n without compromising the principles of conferences"},{"id":"http://arxiv.org/abs/2401.12822v1","updated":"2024-01-23T14:55:46Z","published":"2024-01-23T14:55:46Z","title":"Deep Learning Based Simulators for the Phosphorus Removal Process\n Control in Wastewater Treatment via Deep Reinforcement Learning Algorithms","summary":" Phosphorus removal is vital in wastewater treatment to reduce reliance on\nlimited resources. Deep reinforcement learning (DRL) is a machine learning\ntechnique that can optimize complex and nonlinear systems, including the\nprocesses in wastewater treatment plants, by learning control policies through\ntrial and error. However, applying DRL to chemical and biological processes is\nchallenging due to the need for accurate simulators. This study trained six\nmodels to identify the phosphorus removal process and used them to create a\nsimulator for the DRL environment. Although the models achieved high accuracy\n(>97%), uncertainty and incorrect prediction behavior limited their performance\nas simulators over longer horizons. Compounding errors in the models'\npredictions were identified as one of the causes of this problem. This approach\nfor improving process control involves creating simulation environments for DRL\nalgorithms, using data from supervisory control and data acquisition (SCADA)\nsystems with a sufficient historical horizon, without complex system modeling or\nparameter estimation.\n","authors":["Esmaeel Mohammadi","Mikkel Stokholm-Bjerregaard","Aviaja Anna Hansen","Per Halkjær Nielsen","Daniel Ortiz-Arroyo","Petar Durdevic"],"pdf_url":"https://arxiv.org/pdf/2401.12822v1.pdf","comment":"Journal Paper"},{"id":"http://arxiv.org/abs/2401.12820v1","updated":"2024-01-23T14:53:32Z","published":"2024-01-23T14:53:32Z","title":"DatUS^2: Data-driven Unsupervised Semantic Segmentation with Pre-trained\n Self-supervised Vision Transformer","summary":" Successive proposals of several self-supervised training schemes continue to\nemerge, taking one step closer to developing a universal foundation model. In\nthis process, the unsupervised downstream tasks are recognized as one of the\nevaluation methods to validate the quality of visual features learned with a\nself-supervised training scheme. However, unsupervised dense semantic\nsegmentation has not been explored as a downstream task, which can utilize and\nevaluate the quality of semantic information introduced in patch-level feature\nrepresentations during self-supervised training of a vision transformer.\nTherefore, this paper proposes a novel data-driven approach for unsupervised\nsemantic segmentation (DatUS^2) as a downstream task. DatUS^2 generates\nsemantically consistent and dense pseudo-annotated segmentation masks for the\nunlabeled image dataset without using any visual prior or synchronized data. We\ncompare these pseudo-annotated segmentation masks with ground truth masks for\nevaluating recent self-supervised training schemes to learn shared semantic\nproperties at the patch level and discriminative semantic properties at the\nsegment level. Finally, we evaluate existing state-of-the-art self-supervised\ntraining schemes with our proposed downstream task, i.e., DatUS^2. 
Also, the\nbest version of DatUS^2 outperforms the existing state-of-the-art method for\nthe unsupervised dense semantic segmentation task with 15.02% MiOU and 21.47%\nPixel accuracy on the SUIM dataset. It also achieves a competitive level of\naccuracy for a large-scale and complex dataset, i.e., the COCO dataset.\n","authors":["Sonal Kumar","Arijit Sur","Rashmi Dutta Baruah"],"pdf_url":"https://arxiv.org/pdf/2401.12820v1.pdf","comment":"The manuscript contains 13 pages, 9 figures and 7 tables"},{"id":"http://arxiv.org/abs/2401.12819v1","updated":"2024-01-23T14:53:20Z","published":"2024-01-23T14:53:20Z","title":"Dynamic Layer Tying for Parameter-Efficient Transformers","summary":" In the pursuit of reducing the number of trainable parameters in deep\ntransformer networks, we employ Reinforcement Learning to dynamically select\nlayers during training and tie them together. Every few iterations, the RL\nagent is asked whether to train each layer $i$ independently or to copy the\nweights of a previous layer $j